Container Monitoring
Multi-Cloud Manager provides comprehensive monitoring capabilities for containers across Azure Container Instances and GCP Cloud Run. Track metrics, query logs, and analyze performance in real-time.
Azure Container Instances Monitoring
Get Container Metrics
Retrieve CPU and memory metrics for Azure Container Instances.
Endpoint
GET /api/azure/containers/{containerName}/metrics
Parameters
`containerName` (string, path parameter) — Container group name to monitor
Response
{
"subscriptionId" : "12345678-1234-1234-1234-123456789abc" ,
"resourceGroup" : "my-rg" ,
"resourceId" : "/subscriptions/.../resourceGroups/my-rg/providers/Microsoft.ContainerInstance/containerGroups/my-container" ,
"location" : "westeurope" ,
"containerName" : "my-container" ,
"metrics" : [
{
"name" : "CpuUsage" ,
"unit" : "Count" ,
"data" : [
{
"timestamp" : "2024-01-20T10:00:00Z" ,
"average" : 0.45
},
{
"timestamp" : "2024-01-20T10:01:00Z" ,
"average" : 0.52
}
]
},
{
"name" : "MemoryUsage" ,
"unit" : "Bytes" ,
"data" : [
{
"timestamp" : "2024-01-20T10:00:00Z" ,
"average" : 536870912
},
{
"timestamp" : "2024-01-20T10:01:00Z" ,
"average" : 541065216
}
]
}
]
}
Implementation
from azure.monitor.query import MetricsQueryClient, MetricAggregationType
from datetime import datetime, timedelta
def aci_monitor_metrics(container_group_name):
    """Return CPU and memory metrics for an Azure Container Instance group.

    Looks the container group up across all accessible subscriptions, then
    queries Azure Monitor for the last hour of CpuUsage and MemoryUsage
    averages at 1-minute resolution.

    Args:
        container_group_name: Name of the ACI container group.

    Returns:
        (Flask JSON response, HTTP status): the container details augmented
        with a ``metrics`` list on success (200), or an error payload (404)
        when the container group cannot be found.
    """
    from datetime import timezone  # local import keeps this fix self-contained

    credential = FlaskCredential()
    # Find the container across all subscriptions visible to the credential.
    aci_info = _find_container_details(container_group_name, credential)
    if not aci_info:
        return jsonify({
            "error": f"Nie znaleziono kontenera ACI o nazwie '{container_group_name}'."
        }), 404
    resource_id = aci_info.get("resourceId")

    # Query the last hour of metrics. Use timezone-aware datetimes:
    # datetime.utcnow() is deprecated since Python 3.12 and returns naive
    # values, which the Azure SDK has to guess an offset for.
    client = MetricsQueryClient(credential)
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(hours=1)
    metric_names = ["CpuUsage", "MemoryUsage"]
    response = client.query_resource(
        resource_uri=resource_id,
        metric_names=metric_names,
        timespan=(start_time, end_time),
        interval="PT1M",  # 1-minute intervals
        aggregations=[MetricAggregationType.AVERAGE],
    )

    # Flatten the SDK response into a JSON-friendly structure.
    metrics_data = []
    for metric in response.metrics:
        datapoints = [
            {
                "timestamp": val.timestamp.isoformat(),
                "average": round(val.average, 2),
            }
            for series in metric.timeseries
            for val in series.data
            if val.average is not None  # skip gaps in the time series
        ]
        metrics_data.append({
            "name": metric.name,
            "unit": str(metric.unit),
            "data": datapoints,
        })
    aci_info["metrics"] = metrics_data
    return jsonify(aci_info), 200
Available Metrics
CpuUsage:
CPU core usage (0.0 to CPU limit)
Aggregated across all containers in the group
Sampled every minute
MemoryUsage:
Memory usage in bytes
Includes all containers in the group
Sampled every minute
Metrics are retained for 93 days in Azure Monitor. Historical data older than 93 days is not available.
Get Linked Log Analytics Workspace
Retrieve the Log Analytics workspace associated with a container.
Endpoint
GET /api/azure/containers/{containerName}/workspace
Response
{
"value" : {
"id" : "/subscriptions/.../resourceGroups/my-rg/providers/Microsoft.OperationalInsights/workspaces/my-workspace" ,
"name" : "my-workspace" ,
"location" : "westeurope" ,
"workspaceGuid" : "12345678-1234-1234-1234-123456789abc"
}
}
Implementation
from azure.mgmt.loganalytics import LogAnalyticsManagementClient
def get_aci_linked_workspace(container_group_name):
    """Return the Log Analytics workspace linked to an ACI container group.

    Reads the container group's diagnostics settings and resolves the
    configured workspace customer GUID to a full workspace resource.

    Args:
        container_group_name: Name of the ACI container group.

    Returns:
        (Flask JSON response, HTTP status): ``{"value": {...}}`` with the
        workspace details, ``{"value": None}`` when no workspace is linked
        or the linked GUID cannot be resolved (200), or an error payload
        (404) when the container group is not found.
    """
    credential = FlaskCredential()
    aci_info = _find_container_details(container_group_name, credential)
    if not aci_info:
        return jsonify({
            "error": f"Nie znaleziono kontenera ACI o nazwie '{container_group_name}'."
        }), 404
    sub_id = aci_info.get("subscriptionId")
    rg_name = aci_info.get("resourceGroup")
    aci_client = ContainerInstanceManagementClient(credential, sub_id)
    container_group = aci_client.container_groups.get(rg_name, container_group_name)

    # diagnostics.log_analytics.workspace_id holds the workspace *customer
    # GUID*, not its ARM resource ID — hence the match on ws.customer_id.
    diagnostics = container_group.diagnostics
    if not (diagnostics and diagnostics.log_analytics
            and diagnostics.log_analytics.workspace_id):
        return jsonify({"value": None}), 200

    workspace_guid = diagnostics.log_analytics.workspace_id
    log_analytics_client = LogAnalyticsManagementClient(credential, sub_id)
    # Search the whole subscription: the workspace is not required to live
    # in the container group's resource group (the original only scanned
    # that one resource group and could miss the linked workspace).
    for ws in log_analytics_client.workspaces.list():
        if ws.customer_id == workspace_guid:
            return jsonify({
                "value": {
                    "id": ws.id,
                    "name": ws.name,
                    "location": ws.location,
                    "workspaceGuid": ws.customer_id,
                }
            }), 200

    # Bug fix: the original fell through with an implicit None when a GUID
    # was configured but no matching workspace was found.
    return jsonify({"value": None}), 200
Container instances must be configured with Log Analytics diagnostics to enable log querying. Configure this when creating the container group.
Query Container Logs
Execute KQL (Kusto Query Language) queries against container logs.
Endpoint
POST /api/azure/containers/{containerName}/logs/query
Request Body
{
"workspaceGuid" : "12345678-1234-1234-1234-123456789abc" ,
"kqlQuery" : "ContainerInstanceLog_CL | where ContainerGroup_s == 'my-container' | top 100 by TimeGenerated desc"
}
Parameters
`workspaceGuid` (string, required) — Log Analytics workspace GUID (customer ID)
`kqlQuery` (string, required) — KQL query to execute (must filter by container name)
Response
{
"columns" : [ "TimeGenerated" , "ContainerGroup_s" , "Container_s" , "Message" ],
"rows" : [
[ "2024-01-20T10:30:00Z" , "my-container" , "app" , "Application started" ],
[ "2024-01-20T10:30:01Z" , "my-container" , "app" , "Processing request" ]
]
}
Implementation
from azure.identity import ClientSecretCredential
from azure.monitor.query import LogsQueryClient, LogsQueryStatus
from datetime import timedelta
def run_kql_query(container_group_name):
    """Execute a caller-supplied KQL query against a Log Analytics workspace.

    The query must be scoped to ``container_group_name`` via a
    ``ContainerGroup_s`` equality filter, and must not contain
    data-modifying keywords.

    Args:
        container_group_name: Name of the ACI container group the query
            must be filtered to.

    Returns:
        (Flask JSON response, HTTP status): ``{"columns": [...], "rows":
        [...]}`` on success (200), a validation error (400), or a query
        failure with details (500).
    """
    data = request.get_json(silent=True) or {}  # tolerate a missing/invalid JSON body
    workspace_guid = data.get("workspaceGuid")
    kql_query = data.get("kqlQuery")
    # Bug fix: the original raised TypeError on the substring checks below
    # when either field was absent; fail fast with a 400 instead.
    if not workspace_guid or not kql_query:
        return jsonify({
            "error": "Wymagane pola: 'workspaceGuid' oraz 'kqlQuery'."
        }), 400

    # The query must be scoped to this container group (single- or
    # double-quoted literal accepted).
    if (f"ContainerGroup_s == '{container_group_name}'" not in kql_query
            and f'ContainerGroup_s == "{container_group_name}"' not in kql_query):
        return jsonify({
            "error": f"Zapytanie musi zawierać filtr 'ContainerGroup_s == \"{container_group_name}\"'."
        }), 400

    # Best-effort guard against data-modifying operators.
    # NOTE(review): plain substring matching can false-positive on queries
    # whose message text merely contains these words — confirm this is the
    # intended trade-off (Log Analytics KQL is read-only regardless).
    dangerous_keywords = ['delete', 'update', 'modify', 'insert', 'drop']
    if any(keyword in kql_query.lower() for keyword in dangerous_keywords):
        return jsonify({"error": "Zapytanie zawiera niedozwolone słowa kluczowe."}), 400

    credential = ClientSecretCredential(
        tenant_id=TENANT_ID,
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
    )
    client = LogsQueryClient(credential)
    response = client.query_workspace(
        workspace_id=workspace_guid,
        query=kql_query,
        timespan=timedelta(days=1),
    )
    if response.status == LogsQueryStatus.SUCCESS and response.tables:
        table = response.tables[0]
        if not table.rows:
            return jsonify({"columns": [], "rows": []}), 200
        columns = table.columns
        # Datetimes are not JSON-serializable; everything else is stringified.
        rows = [
            [item.isoformat() if isinstance(item, datetime) else str(item)
             for item in row]
            for row in table.rows
        ]
        return jsonify({"columns": columns, "rows": rows}), 200
    return jsonify({
        "error": "Nie udało się wykonać zapytania.",
        "details": str(response.partial_error),
    }), 500
Example KQL Queries
Get recent logs:
ContainerInstanceLog_CL
| where ContainerGroup_s == 'my-container'
| top 100 by TimeGenerated desc
Filter by severity:
ContainerInstanceLog_CL
| where ContainerGroup_s == 'my-container'
| where Message contains 'ERROR'
| project TimeGenerated, Container_s, Message
Count logs by container:
ContainerInstanceLog_CL
| where ContainerGroup_s == 'my-container'
| summarize Count= count() by Container_s
Time series aggregation:
ContainerInstanceLog_CL
| where ContainerGroup_s == 'my-container'
| summarize Count= count() by bin (TimeGenerated, 5m )
| render timechart
Export Logs to CSV
Export container logs as CSV file.
Endpoint
GET /api/azure/containers/{containerName}/logs/export?workspaceGuid={guid}&hours={hours}&type={type}
Query Parameters
`workspaceGuid` (string, required) — Log Analytics workspace GUID
`hours` (integer, default 1) — Number of hours of logs to export
type
string
default: "container"
Log type to export (currently only "container" is supported)
Response
CSV file download with filename {containerName}_logs.csv
Implementation
import io
import csv
from flask import Response
def export_aci_logs_csv(container_group_name):
    """Export the latest ACI container logs as a semicolon-delimited CSV.

    Fetches up to the 500 most recent ContainerInstanceLog_CL rows for the
    container group within the requested time window and streams them back
    as a CSV attachment.

    Args:
        container_group_name: Name of the ACI container group.

    Query params:
        workspaceGuid: Log Analytics workspace GUID (required).
        hours: Time window in hours (default 1; invalid values fall back to 1).
        type: Log type (reserved; only "container" is currently used).

    Returns:
        A CSV file response on success, a JSON message when no rows exist
        (200), a validation error (400), or a query failure (500).
    """
    workspace_guid = request.args.get("workspaceGuid")
    if not workspace_guid:
        return jsonify({"error": "Parametr 'workspaceGuid' jest wymagany."}), 400
    # Bug fix: a non-numeric 'hours' previously raised an unhandled ValueError.
    try:
        timespan_hours = int(request.args.get("hours", 1))
    except ValueError:
        timespan_hours = 1
    # Reserved for future log types; only "container" is supported today.
    log_type = request.args.get("type", "container").lower()

    credential = ClientSecretCredential(
        tenant_id=TENANT_ID,
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
    )
    client = LogsQueryClient(credential)
    query = f"""
    ContainerInstanceLog_CL
    | where ContainerGroup_s == '{container_group_name}'
    | top 500 by TimeGenerated desc
    """
    response = client.query_workspace(
        workspace_id=workspace_guid,
        query=query,
        timespan=timedelta(hours=timespan_hours),
    )
    # Bug fix: the original had no failure branch and implicitly returned
    # None (a Flask error) when the query did not succeed.
    if response.status != LogsQueryStatus.SUCCESS or not response.tables:
        return jsonify({"error": "Nie udało się wykonać zapytania."}), 500

    table = response.tables[0]
    if not table.rows:
        return jsonify({"message": "Brak danych logów."}), 200

    output = io.StringIO()
    writer = csv.writer(output, delimiter=';')
    writer.writerow(table.columns)
    for row in table.rows:
        writer.writerow([
            item.isoformat() if isinstance(item, datetime) else str(item)
            for item in row
        ])
    csv_content = output.getvalue()
    output.close()
    return Response(
        csv_content,
        mimetype="text/csv",
        headers={
            "Content-Disposition": f"attachment;filename={container_group_name}_logs.csv"
        },
    )
GCP Cloud Run Monitoring
Find Container Details
Locate Cloud Run service across projects and regions.
Endpoint
GET /api/gcp/containers/{containerName}/details
Response
{
"projectId" : "my-project" ,
"region" : "europe-west1" ,
"serviceName" : "hello-service" ,
"resourceName" : "projects/my-project/locations/europe-west1/services/hello-service" ,
"url" : "https://hello-service-abc123-ew.a.run.app"
}
Get Available Metrics
Retrieve list of available metrics for Cloud Run.
Endpoint
GET /api/gcp/containers/{projectId}/{region}/{containerName}/metrics
Response
{
"metrics" : [
{
"type" : "run.googleapis.com/request_count" ,
"displayName" : "Liczba żądań" ,
"unit" : "count"
},
{
"type" : "run.googleapis.com/request_latencies" ,
"displayName" : "Opóźnienia żądań" ,
"unit" : "ms"
},
{
"type" : "run.googleapis.com/container/instance_count" ,
"displayName" : "Liczba instancji" ,
"unit" : "count"
}
]
}
Get Metric Data
Retrieve time-series data for specific metrics.
Endpoint
POST /api/gcp/containers/{projectId}/{region}/{containerName}/metrics/data
Request Body
{
"metricType" : "run.googleapis.com/request_count" ,
"timespanMinutes" : 60
}
Parameters
`metricType` (string, required) — Metric type to query (e.g., run.googleapis.com/request_count)
`timespanMinutes` (integer, default 60) — Time range in minutes
Response
{
"data" : [
{
"timestamp" : "2024-01-20T10:00:00Z" ,
"average" : 125.0
},
{
"timestamp" : "2024-01-20T10:01:00Z" ,
"average" : 142.0
}
]
}
Implementation
from google.cloud import monitoring_v3
import pytz
def get_gcp_container_metric_data(project_id, region, container_name):
    """Return aligned time-series datapoints for one Cloud Run metric.

    Queries Cloud Monitoring for the requested metric over the given time
    window, aligned to 1-minute buckets with an aligner chosen by metric
    family (SUM for counters, p95 for latencies, MEAN otherwise).

    Args:
        project_id: GCP project ID.
        region: Cloud Run region (resource.labels.location).
        container_name: Cloud Run service name.

    Returns:
        (Flask JSON response, HTTP status): ``{"data": [...]}`` sorted by
        timestamp (200), a validation error (400), or a missing-account
        error (401).
    """
    accounts = session.get("accounts", [])
    gcp_account = next(
        (acc for acc in accounts if acc.get("provider") == "gcp"),
        None,
    )
    # Bug fix: the original passed None into SessionCredentials when no GCP
    # account was present in the session.
    if not gcp_account:
        return jsonify({"error": "Brak konta GCP w sesji."}), 401

    data = request.get_json(silent=True) or {}
    metric_type = data.get("metricType")
    if not metric_type:
        return jsonify({"error": "Pole 'metricType' jest wymagane."}), 400
    timespan_minutes = int(data.get("timespanMinutes", 60))

    credentials = SessionCredentials(gcp_account)
    client = monitoring_v3.MetricServiceClient(credentials=credentials)

    now = datetime.utcnow().replace(tzinfo=pytz.UTC)
    interval = monitoring_v3.TimeInterval(
        end_time=now,
        start_time=now - timedelta(minutes=timespan_minutes),
    )
    project_name = f"projects/{project_id}"
    filter_query = (
        f'metric.type = "{metric_type}" AND '
        f'resource.type = "cloud_run_revision" AND '
        f'resource.labels.service_name = "{container_name}" AND '
        f'resource.labels.location = "{region}"'
    )

    # Pick an aligner appropriate for the metric family.
    aligner = monitoring_v3.Aggregation.Aligner.ALIGN_MEAN
    if "count" in metric_type.lower():
        aligner = monitoring_v3.Aggregation.Aligner.ALIGN_SUM
    elif "latencies" in metric_type.lower():
        aligner = monitoring_v3.Aggregation.Aligner.ALIGN_PERCENTILE_95

    req = {
        "name": project_name,
        "filter": filter_query,
        "interval": interval,
        "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
        "aggregation": {
            "alignment_period": {"seconds": 60},
            "per_series_aligner": aligner,
        },
    }
    query_result = client.list_time_series(request=req)

    data_points = []
    for series in query_result:
        for point in series.points:
            # TypedValue is a oneof; probe the numeric fields in turn.
            # NOTE(review): a genuine zero is indistinguishable from an
            # unset field with this scheme, so zeros fall through to the
            # 0.0 default — confirm this is acceptable for these metrics.
            if point.value.double_value != 0.0:
                value = point.value.double_value
            elif point.value.int64_value != 0:
                value = point.value.int64_value
            elif point.value.bool_value:
                value = 1
            else:
                value = 0.0
            if "utilization" in metric_type.lower():
                value = value * 100  # express utilization as a percentage
            data_points.append({
                "timestamp": point.interval.end_time.isoformat(),
                "average": round(value, 4),
            })
    data_points.sort(key=lambda p: p["timestamp"])
    return jsonify({"data": data_points}), 200
Query Logs
Query Cloud Run logs using Log Query Language (LQL).
Endpoint
POST /api/gcp/containers/{projectId}/{containerName}/logs/query
Request Body
{
"lqlQuery" : "resource.type= \" cloud_run_revision \" resource.labels.service_name= \" hello-service \" severity>=WARNING"
}
Parameters
`lqlQuery` (string, required) — LQL query (must include a resource.labels.service_name filter)
Response
{
"columns" : [ "timestamp" , "severity" , "payload" , "message" ],
"rows" : [
[ "2024-01-20T10:30:00Z" , "WARNING" , "Slow request detected" , "Request took 2.5s" ],
[ "2024-01-20T10:30:15Z" , "ERROR" , "Database connection failed" , "Timeout after 30s" ]
]
}
Implementation
from google.cloud import logging_v2
def query_gcp_container_logs(project_id, container_name):
    """Run a caller-supplied LQL query against Cloud Logging.

    The query must be scoped to ``container_name`` via a
    ``resource.labels.service_name`` filter. Structured (dict) payload keys
    are promoted to their own columns; up to 100 entries are returned.

    Args:
        project_id: GCP project ID.
        container_name: Cloud Run service name the query must filter on.

    Returns:
        (Flask JSON response, HTTP status): ``{"columns": [...], "rows":
        [...]}`` on success (200), a validation error (400), or a
        missing-account error (401).
    """
    accounts = session.get("accounts", [])
    gcp_account = next(
        (acc for acc in accounts if acc.get("provider") == "gcp"),
        None,
    )
    # Bug fix: avoid constructing credentials from a missing account.
    if not gcp_account:
        return jsonify({"error": "Brak konta GCP w sesji."}), 401

    data = request.get_json(silent=True) or {}
    lql_query = data.get("lqlQuery")
    service_filter = f'resource.labels.service_name="{container_name}"'
    # Bug fix: a missing 'lqlQuery' previously raised TypeError on the
    # substring check; treat it as a validation failure instead.
    if not lql_query or service_filter not in lql_query:
        return jsonify({
            "error": f"Zapytanie LQL musi zawierać filtr: {service_filter}"
        }), 400

    credentials = SessionCredentials(gcp_account)
    client = logging_v2.Client(credentials=credentials, project=project_id)
    # Bug fix: page_size only sizes individual pages — the iterator would
    # keep paging through *all* matching entries; max_results caps the
    # total number of entries fetched at 100.
    entries_iterator = client.list_entries(
        filter_=lql_query,
        order_by=logging_v2.DESCENDING,
        page_size=100,
        max_results=100,
    )

    columns_set = {"timestamp", "severity", "payload"}
    rows_data = []
    for entry in entries_iterator:
        row = {
            "timestamp": entry.timestamp.isoformat(),
            "severity": entry.severity,
        }
        payload = entry.payload
        if isinstance(payload, dict):
            # Structured (JSON) payload: promote each key to its own column.
            for key, value in payload.items():
                columns_set.add(key)
                row[key] = str(value)
        elif isinstance(payload, str):
            row["payload"] = payload
        else:
            row["payload"] = str(payload) if payload is not None else "N/A"
        rows_data.append(row)

    if not rows_data:
        return jsonify({"columns": [], "rows": []}), 200

    # Stable, sorted column order; rows are padded with "" for columns a
    # given entry did not provide.
    columns = sorted(columns_set)
    final_rows = [[row.get(col, "") for col in columns] for row in rows_data]
    return jsonify({"columns": columns, "rows": final_rows}), 200
Example LQL Queries
Get recent logs:
resource.type="cloud_run_revision"
resource.labels.service_name="hello-service"
Filter by severity:
resource.type="cloud_run_revision"
resource.labels.service_name="hello-service"
severity>=ERROR
Search log text:
resource.type="cloud_run_revision"
resource.labels.service_name="hello-service"
textPayload=~"database"
Time range filter:
resource.type="cloud_run_revision"
resource.labels.service_name="hello-service"
timestamp>="2024-01-20T00:00:00Z"
Best Practices
Azure Monitoring
Configure Log Analytics workspace during container creation
Set appropriate log retention policies (30-730 days)
Use KQL queries for efficient log analysis
Export logs to storage for long-term retention
Monitor metric trends to identify performance issues
GCP Monitoring
Enable Cloud Logging for all Cloud Run services
Use structured logging (JSON format) for better querying
Set up log sinks for long-term storage
Monitor request latency P95 and P99 percentiles
Use Cloud Trace for distributed tracing
Query shorter time ranges for faster results
Use aggregations instead of raw data points
Limit result sets to necessary data
Cache frequently accessed metrics
Use log sampling for high-volume applications
Container Alerts Configure alerts based on metrics and logs
Azure Containers Azure Container Instances management
GCP Containers GCP Cloud Run service management