Overview
Streaming allows you to receive model responses incrementally as they’re generated, rather than waiting for the complete response. This is implemented using Server-Sent Events (SSE) and is useful for real-time user interfaces.
Enabling Streaming
Set stream: true in your chat completions request:
import requests
import json

# Request a streamed completion: "stream": True makes the server reply with
# Server-Sent Events (SSE) instead of a single JSON body.
url = "http://localhost:8000/v1/chat/completions"
data = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user", "content": "Count from 1 to 10"}
    ],
    "stream": True
}

# stream=True on the client side keeps requests from buffering the whole body.
response = requests.post(url, json=data, stream=True)
for line in response.iter_lines():
    if line:
        line = line.decode('utf-8')
        # SSE payload lines are prefixed with "data: "
        if line.startswith('data: '):
            data_str = line[6:]
            if data_str == '[DONE]':
                # Sentinel: the server has finished streaming.
                break
            chunk = json.loads(data_str)
            delta = chunk['choices'][0]['delta']
            if 'content' in delta:
                print(delta['content'], end='', flush=True)
Initial Chunk
The first chunk contains the role:
data: {
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"choices": [
{
"index": 0,
"delta": {
"role": "assistant"
},
"finish_reason": null
}
],
"created": 1677652288
}
Content Chunks
Subsequent chunks contain content deltas:
data: {
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"choices": [
{
"index": 0,
"delta": {
"content": "Hello"
},
"finish_reason": null
}
]
}
Final Chunk
The last chunk has an empty delta and a finish_reason:
data: {
"model": "gpt-3.5-turbo",
"object": "chat.completion.chunk",
"choices": [
{
"index": 0,
"delta": {},
"finish_reason": "stop"
}
]
}
Stream Termination
After the final chunk, the server sends a literal data: [DONE] message; clients should stop reading when they see it.
Response Fields
object: Always "chat.completion.chunk" for streaming responses
delta: Incremental content update:
role: Present in first chunk only
content: Text content delta (not cumulative)
finish_reason: Null during generation, then one of:
"stop": Natural completion
"length": Reached max_length
Python Client Examples
Basic Streaming
import requests
import json
def stream_chat(messages):
    """Stream a chat completion and print content deltas as they arrive.

    messages: list of {"role": ..., "content": ...} dicts forming the
    conversation so far.
    """
    url = "http://localhost:8000/v1/chat/completions"
    response = requests.post(
        url,
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            # SSE payload lines are prefixed with "data: "
            if line.startswith('data: '):
                data = line[6:]
                if data == '[DONE]':
                    print()  # New line at end
                    break
                chunk = json.loads(data)
                delta = chunk['choices'][0]['delta']
                if 'content' in delta:
                    print(delta['content'], end='', flush=True)
# Example: stream a one-message conversation to stdout.
messages = [
{"role": "user", "content": "Write a haiku about programming"}
]
stream_chat(messages)
With OpenAI SDK
import openai

# Point the (legacy 0.x) SDK at the local server; the key is unused by the
# server but must be set to something.
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)

for chunk in response:
    delta = chunk.choices[0].delta
    # getattr with a None check guards both role-only / empty deltas and a
    # delta whose content attribute exists but is None — hasattr alone would
    # let the latter through and print the string "None".
    content = getattr(delta, 'content', None)
    if content is not None:
        print(content, end='', flush=True)
print()  # New line
Collecting Full Response
import requests
import json
def stream_and_collect(messages):
    """Stream a chat completion, echoing deltas, and return the full text.

    Returns the concatenation of every content delta received before the
    [DONE] sentinel (empty string if none arrived).
    """
    url = "http://localhost:8000/v1/chat/completions"
    response = requests.post(
        url,
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )
    # Collect pieces in a list and join once at the end — avoids quadratic
    # string concatenation on long responses.
    parts = []
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                data = line[6:]
                if data == '[DONE]':
                    break
                chunk = json.loads(data)
                delta = chunk['choices'][0]['delta']
                if 'content' in delta:
                    content = delta['content']
                    parts.append(content)
                    print(content, end='', flush=True)
    print()  # New line
    return ''.join(parts)
# Example: stream, echo, and keep the assembled text.
response = stream_and_collect([
{"role": "user", "content": "Explain async programming"}
])
print(f"\nFull response length: {len(response)} characters")
JavaScript Example
// Stream a chat completion over SSE and write content deltas to stdout
// (Node.js: uses process.stdout).
async function streamChat(messages) {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: messages,
      stream: true,
    }),
  });
  // Fail fast on an error status — otherwise an error body would be fed to
  // JSON.parse below and surface as a confusing parse error.
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder('utf-8');
  // `buffer` carries any partial line left over between reads.
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });

    const lines = buffer.split('\n');
    buffer = lines.pop(); // keep the trailing partial line for the next read

    for (const line of lines) {
      if (line.startsWith('data: ')) {
        const data = line.slice(6);
        if (data === '[DONE]') {
          return; // server signals end of stream
        }
        const chunk = JSON.parse(data);
        const delta = chunk.choices[0].delta;
        if (delta.content) {
          process.stdout.write(delta.content);
        }
      }
    }
  }
}
// Example: stream a single-message conversation to stdout.
streamChat([
{ role: 'user', content: 'Write a poem about AI' }
]);
Limitations
Function Calling Not Supported
Streaming does not support function calling:
# This will return HTTP 400
# Deliberately-invalid example: combining "stream" with "functions" is
# rejected by the server (see error response below).
response = requests.post(
"http://localhost:8000/v1/chat/completions",
json={
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hello"}],
"stream": True,
"functions": [{"name": "test", "parameters": {}}] # Not allowed
}
)
Error response:
{
"detail": "Invalid request: Function calling is not yet implemented for stream mode."
}
Best Practices
Buffer Management
Handle partial lines in streams:
def stream_with_buffer(response):
    """Re-assemble complete SSE lines from a chunked streaming response.

    iter_content can yield partial lines; accumulate chunks into `buffer`
    and only hand complete, newline-terminated lines to process_line.
    NOTE(review): relies on a process_line helper defined elsewhere.
    """
    buffer = ""
    for chunk in response.iter_content(decode_unicode=True):
        buffer += chunk
        while '\n' in buffer:
            line, buffer = buffer.split('\n', 1)
            if line.startswith('data: '):
                process_line(line)
Error Handling
import requests
import json
def safe_stream(messages):
    """Yield content deltas from a streamed completion, handling errors.

    Network/HTTP failures are printed and end the generator early;
    malformed SSE chunks are skipped with a warning instead of crashing.
    """
    try:
        response = requests.post(
            "http://localhost:8000/v1/chat/completions",
            json={
                "model": "gpt-3.5-turbo",
                "messages": messages,
                "stream": True
            },
            stream=True,
            timeout=30
        )
        response.raise_for_status()
        for line in response.iter_lines():
            # Guard clauses: skip keep-alive blanks and non-data lines.
            if not line:
                continue
            line = line.decode('utf-8')
            if not line.startswith('data: '):
                continue
            data = line[6:]
            if data == '[DONE]':
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON: {data}")
                continue
            delta = chunk['choices'][0]['delta']
            if 'content' in delta:
                yield delta['content']
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return
# Consume the generator; request errors are handled inside safe_stream.
for content in safe_stream([{"role": "user", "content": "Hello"}]):
    print(content, end='', flush=True)
Stop Word Handling
The streaming implementation includes a delay buffer to properly handle stop words. The last few tokens may be held back temporarily to check for stop sequences before being yielded.