Learn how to stream content generation responses in real-time using synchronous and asynchronous streaming with the Google Gen AI Python SDK.
Streaming allows the model to send responses back incrementally as they are generated, rather than waiting for the complete response. This makes interactive applications feel far more responsive, since users see output immediately instead of staring at a blank screen.
The generate_content_stream method returns an iterator that yields chunks as they arrive:
from google import genai

# Create a client and stream the model's reply chunk-by-chunk.
client = genai.Client(api_key='your-api-key')

stream = client.models.generate_content_stream(
    model='gemini-2.5-flash',
    contents='Tell me a story in 300 words.',
)
for chunk in stream:
    # end='' joins chunks on one line instead of one line per print().
    print(chunk.text, end='')
Notice the end='' argument to print(): it suppresses the newline that print() would otherwise add after each chunk, so the chunks flow together into one smooth stream of text.
import asyncio

from google import genai

client = genai.Client(api_key='your-api-key')


async def stream_content():
    """Stream a generation asynchronously, printing chunks as they arrive."""
    # The async variant lives under client.aio; awaiting the call yields
    # an async iterator of chunks.
    response = await client.aio.models.generate_content_stream(
        model='gemini-2.5-flash',
        contents='Tell me a story in 300 words.',
    )
    async for chunk in response:
        print(chunk.text, end='')


asyncio.run(stream_content())
import asyncio


async def generate_content():
    """Request a complete (non-streaming) generation asynchronously."""
    # NOTE(review): this snippet assumes `client` was created earlier in
    # the document — it does not construct one itself.
    result = await client.aio.models.generate_content(
        model='gemini-2.5-flash',
        contents='Tell me a story in 300 words.',
    )
    print(result.text)


asyncio.run(generate_content())
Each chunk in the stream has the same structure as a complete response:
# Each streamed chunk mirrors the structure of a complete response.
for chunk in client.models.generate_content_stream(
    model='gemini-2.5-flash',
    contents='Explain quantum computing'
):
    # Access text directly
    print(chunk.text, end='')
    # Or access parts
    for part in chunk.parts:
        if part.text:
            print(part.text, end='')
    # Usage metadata: the attribute exists on every chunk object, so the
    # previous hasattr() check was always true and fired on every chunk.
    # Test its value instead — presumably it is only populated on the
    # final chunk (confirm against the SDK's response model).
    if chunk.usage_metadata is not None:
        print(f"\nTokens used: {chunk.usage_metadata.total_token_count}")
# A multi-turn chat in which every reply is streamed.
# NOTE(review): assumes `client` was created earlier in the document.
chat = client.chats.create(model='gemini-2.5-flash')

for chunk in chat.send_message_stream('tell me a story'):
    print(chunk.text, end='')

print("\n---")

# The chat keeps history, so this turn can reference the previous reply.
for chunk in chat.send_message_stream('summarize it in one sentence'):
    print(chunk.text, end='')
import asyncio


async def chat_stream():
    """Run a two-turn async chat, streaming both replies."""
    # NOTE(review): assumes `client` was created earlier in the document.
    chat = await client.aio.chats.create(model='gemini-2.5-flash')

    first_turn = await chat.send_message_stream('tell me a story')
    async for chunk in first_turn:
        print(chunk.text, end='')

    print("\n---")

    second_turn = await chat.send_message_stream('summarize it in one sentence')
    async for chunk in second_turn:
        print(chunk.text, end='')


asyncio.run(chat_stream())
You can apply all standard configuration options to streaming:
from google.genai import types

# Streaming accepts the same GenerateContentConfig as non-streaming calls.
config = types.GenerateContentConfig(
    temperature=0.9,
    max_output_tokens=500,
    system_instruction='You are a creative poet who writes in rhyme.',
)

for chunk in client.models.generate_content_stream(
    model='gemini-2.5-flash',
    contents='Write a poem about coding',
    config=config,
):
    print(chunk.text, end='')
Different strategies for handling streamed content:
Immediate Display
Accumulate and Display
Sentence-by-Sentence
Display each chunk immediately (best for chatbots):
# Print every chunk the instant it arrives; flush=True forces the
# terminal to render immediately instead of buffering output.
for chunk in client.models.generate_content_stream(
    model='gemini-2.5-flash',
    contents='Tell me a story'
):
    print(chunk.text, end='', flush=True)
Accumulate the full response while streaming:
# Display chunks live while also collecting them for use after the stream.
pieces = []
for chunk in client.models.generate_content_stream(
    model='gemini-2.5-flash',
    contents='Tell me a story'
):
    pieces.append(chunk.text)
    print(chunk.text, end='', flush=True)

# Full response available after streaming
complete_text = ''.join(pieces)
print(f"\n\nTotal length: {len(complete_text)}")
Buffer and display complete sentences:
import re

# Buffer incoming chunks and emit only complete sentences.
buffer = ""
for chunk in client.models.generate_content_stream(
    model='gemini-2.5-flash',
    contents='Tell me a story'
):
    buffer += chunk.text
    # Emit each complete sentence in order of appearance. The original
    # tried delimiters in the fixed order ['. ', '! ', '? '] and split at
    # the first one present anywhere in the buffer, which could merge
    # sentences (e.g. "Wow! Then. " printed as one line). Splitting at the
    # EARLIEST delimiter occurrence fixes that.
    while (match := re.search(r'[.!?] ', buffer)) is not None:
        end = match.end()
        print(buffer[:end].rstrip())
        buffer = buffer[end:]
# Print any remaining text
if buffer:
    print(buffer)
# Keep the whole stream inside try/except: an error can surface
# mid-iteration, after some chunks have already been printed.
try:
    stream = client.models.generate_content_stream(
        model='gemini-2.5-flash',
        contents='Tell me a story'
    )
    for chunk in stream:
        print(chunk.text, end='', flush=True)
except Exception as e:
    print(f"\n\nStreaming error: {e}")