diff --git a/codebundles/azure-servicebus-health/service_bus_metrics.sh b/codebundles/azure-servicebus-health/service_bus_metrics.sh index 6c0fe638a..81a4e67f0 100755 --- a/codebundles/azure-servicebus-health/service_bus_metrics.sh +++ b/codebundles/azure-servicebus-health/service_bus_metrics.sh @@ -98,37 +98,236 @@ add_issue() { # Check for server errors server_errors=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(select(.total > 0)) | length' <<< "$metrics_data") if [[ "$server_errors" -gt 0 ]]; then + # Get detailed error metrics + total_errors=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + max_errors=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(.maximum // 0) | max' <<< "$metrics_data") + avg_errors=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(.average // 0) | add / length' <<< "$metrics_data") + add_issue 1 \ - "Service Bus namespace $SB_NAMESPACE_NAME has server errors" \ + "Service Bus namespace $SB_NAMESPACE_NAME has $total_errors server errors" \ "Investigate service bus logs for the specific errors and consider opening a support case with Microsoft" \ - "Server errors detected in metrics" + "SERVER ERROR ANALYSIS: +- Total Server Errors: $total_errors (over $METRIC_INTERVAL interval) +- Maximum Errors in Single Period: $max_errors +- Average Errors: $avg_errors +- Namespace: $SB_NAMESPACE_NAME +- Resource Group: $AZ_RESOURCE_GROUP +- Metric Interval: $METRIC_INTERVAL + +CONTEXT: Server errors indicate internal Azure Service Bus issues that are not caused by client applications. These are service-side failures that can impact message reliability and availability. Server errors can result from: +1. Azure infrastructure issues or service degradation +2. Resource exhaustion at the service level +3. Internal Service Bus component failures +4. Transient network or storage issues in Azure backend +5. Service deployment or maintenance activities + +INVESTIGATION STEPS: +1. Check Azure Service Health dashboard for any ongoing incidents +2. Review Azure Activity Log for service-level events +3. Query Service Bus diagnostic logs for detailed error information: + - OperationalLogs for runtime operations + - RuntimeAuditLogs for audit information +4. Check metrics for correlation with other issues (throttling, connection drops) +5. Verify if errors are transient or persistent +6. Note exact timestamps of errors for Microsoft support investigation +7. Review retry policies in client applications + +RECOMMENDATIONS: +- Open Azure support case if errors are persistent (severity 1) +- Implement exponential backoff retry logic in client applications +- Enable diagnostic logging for detailed error tracking +- Set up alerts for server error rate thresholds +- Document error patterns and frequencies for support case +- Consider implementing circuit breaker patterns in applications +- Review namespace health and consider failover strategies if available + +BUSINESS IMPACT: Server errors can cause message delivery failures, data loss, and service disruptions. As these are service-side issues, they require Azure support intervention if persistent. Critical severity issue requiring immediate attention." 
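As an aside, the detail blocks above all lean on one jq aggregation pattern: sum of `total`, max of `maximum`, mean of `average` over the metric timeseries. A minimal, self-contained sketch of that pattern; the payload mimics the shape returned by `az monitor metrics list`, with invented values:

```bash
#!/usr/bin/env bash
# Self-contained sketch of the aggregation pattern used above; the sample
# payload mirrors the `az monitor metrics list` JSON shape (values invented).
metrics_data=$(cat <<'EOF'
{"ServerErrors":{"value":[{"timeseries":[{"data":[
  {"total":0,"maximum":0,"average":0},
  {"total":3,"maximum":2,"average":0.5},
  {"total":1,"maximum":1,"average":0.2}
]}]}]}}
EOF
)
# Sum all per-period totals, defaulting missing fields to 0
total=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data")
# Largest single-period maximum
peak=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(.maximum // 0) | max' <<< "$metrics_data")
# Mean of per-period averages (the data array is known non-empty here)
avg=$(jq -r '.ServerErrors.value[0].timeseries[0].data | map(.average // 0) | add / length' <<< "$metrics_data")
echo "total=$total peak=$peak avg=$avg"   # -> total=4 peak=2 avg=0.233...
```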
fi # Check for throttling throttled=$(jq -r '.ThrottledRequests.value[0].timeseries[0].data | map(select(.total > 0)) | length' <<< "$metrics_data") if [[ "$throttled" -gt 0 ]]; then + # Get detailed throttling metrics + total_throttled=$(jq -r '.ThrottledRequests.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + max_throttled=$(jq -r '.ThrottledRequests.value[0].timeseries[0].data | map(.maximum // 0) | max' <<< "$metrics_data") + incoming_msgs=$(jq -r '.IncomingMessages.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + outgoing_msgs=$(jq -r '.OutgoingMessages.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + active_connections=$(jq -r '.ActiveConnections.value[0].timeseries[0].data | map(.maximum // 0) | max' <<< "$metrics_data") + add_issue 3 \ - "Service Bus namespace $SB_NAMESPACE_NAME is experiencing throttling" \ + "Service Bus namespace $SB_NAMESPACE_NAME is experiencing $total_throttled throttled requests" \ "Consider upgrading the SKU or scaling up capacity units if this is a persistent issue" \ - "Throttling detected in metrics" + "THROTTLING ANALYSIS:
+- Total Throttled Requests: $total_throttled (over $METRIC_INTERVAL interval)
+- Maximum Throttled in Single Period: $max_throttled
+- Total Incoming Messages: $incoming_msgs
+- Total Outgoing Messages: $outgoing_msgs
+- Maximum Active Connections: $active_connections
+- Namespace: $SB_NAMESPACE_NAME
+- Resource Group: $AZ_RESOURCE_GROUP
+- Metric Interval: $METRIC_INTERVAL
+
+CONTEXT: Throttling occurs when the namespace exceeds its allocated throughput limits based on the pricing tier (Basic, Standard, or Premium) and, on the Premium tier, the configured messaging units (MU). Azure Service Bus enforces limits on:
+1. Operations per second (send, receive, management operations)
+2. Number of concurrent connections
+3. Message size and throughput (MB/s)
+4. CPU usage (Premium tier)
+
+Throttling causes:
+- Failed send/receive operations requiring retry
+- Increased latency and degraded application performance
+- Potential message loss if retry logic is insufficient
+- Application errors and timeouts
+
+INVESTIGATION STEPS:
+1. Check current namespace pricing tier and capacity allocation
+2. Review namespace metrics for specific throttling patterns:
+   - High incoming message rate relative to tier limits
+   - Connection count approaching tier maximums
+   - CPU usage if Premium tier
+3. Analyze application request patterns for spikes or sustained high load
+4. Verify if throttling coincides with specific business events
+5. Check if multiple queues/topics are competing for namespace throughput
+6. Review client application retry logic and backoff strategies
+7. Verify namespace scaling configuration
+
+RECOMMENDATIONS:
+- Upgrade to a higher pricing tier (Standard to Premium) for increased throughput
+- Scale up by adding messaging units if on the Premium tier
+- Enable auto-scaling if using Premium tier
+- Implement efficient connection pooling in client applications
+- Use batching for send/receive operations to reduce operation count
+- Distribute load across multiple namespaces if possible
+- Optimize message size to reduce bandwidth consumption
+- Implement proper exponential backoff retry policies
+- Consider Premium tier for guaranteed performance and isolation
+
+BUSINESS IMPACT: Throttling causes message delivery delays, application timeouts, and potential data loss.
Performance degradation affects user experience and business operations. High priority issue requiring capacity planning review." fi # Check for high user errors user_errors=$(jq -r '.UserErrors.value[0].timeseries[0].data | map(select(.total > 10)) | length' <<< "$metrics_data") if [[ "$user_errors" -gt 0 ]]; then + # Get detailed user error metrics + total_user_errors=$(jq -r '.UserErrors.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + max_user_errors=$(jq -r '.UserErrors.value[0].timeseries[0].data | map(.maximum // 0) | max' <<< "$metrics_data") + avg_user_errors=$(jq -r '.UserErrors.value[0].timeseries[0].data | map(.average // 0) | add / length' <<< "$metrics_data") + total_requests=$(jq -r '(.IncomingMessages.value[0].timeseries[0].data | map(.total // 0) | add // 0) + (.OutgoingMessages.value[0].timeseries[0].data | map(.total // 0) | add // 0)' <<< "$metrics_data") + add_issue 3 \ - "Service Bus namespace $SB_NAMESPACE_NAME has a high number of user errors" \ + "Service Bus namespace $SB_NAMESPACE_NAME has $total_user_errors user errors" \ "Review application logs and SAS key policies to ensure proper authentication and permissions" \ - "High user error count detected in metrics" + "USER ERROR ANALYSIS: +- Total User Errors: $total_user_errors (over $METRIC_INTERVAL interval) +- Maximum Errors in Single Period: $max_user_errors +- Average Errors: $avg_user_errors +- Total Requests (Incoming + Outgoing): $total_requests +- Error Rate: $(awk "BEGIN {printf \"%.2f%%\", ($total_user_errors / ($total_requests + $total_user_errors)) * 100}") +- Namespace: $SB_NAMESPACE_NAME +- Resource Group: $AZ_RESOURCE_GROUP +- Metric Interval: $METRIC_INTERVAL + +CONTEXT: User errors indicate client-side issues with Service Bus operations. These are errors caused by incorrect usage, authentication problems, or authorization failures. Common user error scenarios include: +1. Authentication failures (invalid or expired SAS tokens, connection strings) +2. Authorization errors (insufficient permissions on queues/topics/subscriptions) +3. Invalid operations (operating on non-existent entities) +4. Protocol violations or malformed requests +5. Message size or property limit violations +6. Lock token expiration or invalid lock tokens +7. Message format or encoding issues + +INVESTIGATION STEPS: +1. Enable detailed diagnostic logging for the namespace +2. Review application logs for specific error codes and messages +3. Check common Service Bus error codes: + - 401 Unauthorized: Authentication issues + - 403 Forbidden: Authorization issues + - 404 Not Found: Entity doesn't exist + - 400 Bad Request: Invalid operation or message +4. Verify SAS policies and connection strings are current +5. Check entity names in application code match actual entities +6. Review message content for size/property violations +7. Verify client SDK versions are up to date +8. 
Check for concurrent operations on same messages (lock conflicts) + +RECOMMENDATIONS: +- Implement proper error handling and logging in applications +- Rotate and validate SAS tokens and connection strings +- Review and update access policies (Manage, Send, Listen permissions) +- Add retry logic with exponential backoff for transient errors +- Validate message payloads before sending +- Update client SDKs to latest versions +- Implement connection string validation on application startup +- Set up monitoring for specific error code patterns +- Use Managed Identity authentication to avoid token expiration issues + +BUSINESS IMPACT: High user error rates indicate application configuration or code issues that cause operation failures. While not service-level issues, they result in failed message operations, potential data loss, and degraded application functionality requiring developer attention." fi # Check for namespace size usage size_percent=$(jq -r '.Size.value[0].timeseries[0].data | map(.maximum) | max // 0' <<< "$metrics_data") if (( $(echo "$size_percent > 80" | bc -l) )); then + # Get additional storage metrics + avg_size=$(jq -r '.Size.value[0].timeseries[0].data | map(.average // 0) | add / length' <<< "$metrics_data") + max_size=$(jq -r '.Size.value[0].timeseries[0].data | map(.maximum // 0) | max' <<< "$metrics_data") + incoming_msgs=$(jq -r '.IncomingMessages.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + outgoing_msgs=$(jq -r '.OutgoingMessages.value[0].timeseries[0].data | map(.total // 0) | add // 0' <<< "$metrics_data") + + # Calculate message imbalance (use bc for float-safe arithmetic) + msg_imbalance=$(echo "$incoming_msgs - $outgoing_msgs" | bc -l) + add_issue 3 \ - "Service Bus namespace $SB_NAMESPACE_NAME is approaching storage limit (${size_percent}%)" \ + "Service Bus namespace $SB_NAMESPACE_NAME is approaching storage limit at ${size_percent}%" \ "Consider implementing a message purging strategy or increasing the namespace size limit" \ - "Storage usage exceeding 80%" + "NAMESPACE STORAGE CAPACITY ANALYSIS: +- Current Storage Usage: ${size_percent}% (maximum observed) +- Average Storage Usage: ${avg_size}% +- Peak Storage Usage: ${max_size}% +- Total Incoming Messages: $incoming_msgs +- Total Outgoing Messages: $outgoing_msgs +- Message Imbalance: $msg_imbalance (incoming - outgoing) +- Namespace: $SB_NAMESPACE_NAME +- Resource Group: $AZ_RESOURCE_GROUP +- Metric Interval: $METRIC_INTERVAL + +CONTEXT: Namespace storage capacity limits vary by pricing tier and can be exhausted by accumulated messages, dead-letter messages, and scheduled messages across all queues and topics. Storage limit warnings indicate: +1. Messages accumulating faster than being processed/deleted +2. Large message sizes consuming storage quota +3. Dead-letter queues filling up with unprocessed messages +4. Scheduled messages awaiting delivery +5. Insufficient consumer throughput across multiple entities + +When storage limit is reached: +- New messages will be rejected (QUOTA_EXCEEDED errors) +- Publishers will experience send failures +- Applications will fail unless retry logic is implemented +- Service disruption for all queues and topics in namespace + +INVESTIGATION STEPS: +1. Identify queues and topics with highest message counts +2. Check dead-letter queues for accumulated failed messages +3. Review message TTL settings across all entities +4. Analyze message sizes and identify large messages +5. 
Verify consumer applications are running and processing messages
+6. Check for inactive or abandoned subscriptions holding messages
+7. Review scheduled message counts and delivery times
+8. Examine auto-delete on idle configurations
+9. Monitor storage growth rate to predict capacity exhaustion time
+
+RECOMMENDATIONS:
+- Immediate: Increase namespace storage quota if available for tier
+- Scale consumers to process message backlog faster
+- Implement aggressive message cleanup after processing
+- Configure appropriate message TTL to auto-expire old messages
+- Investigate and clear dead-letter queues
+- Remove or disable inactive subscriptions
+- Enable auto-delete on idle for unused entities
+- Consider message archival strategy for long-term retention
+- Upgrade to higher pricing tier for increased storage capacity
+- Distribute load across multiple namespaces if needed
+- Implement monitoring for per-queue/topic storage usage
+
+BUSINESS IMPACT: Storage capacity exhaustion will cause complete service disruption with message rejection and publisher failures across all entities in the namespace. Critical issue requiring immediate action to prevent outage." fi # Write issues to output file diff --git a/codebundles/azure-servicebus-health/service_bus_queue_health.sh b/codebundles/azure-servicebus-health/service_bus_queue_health.sh index bfdbd9b06..3cd95fc04 100755 --- a/codebundles/azure-servicebus-health/service_bus_queue_health.sh +++ b/codebundles/azure-servicebus-health/service_bus_queue_health.sh @@ -75,10 +75,39 @@ add_issue() { disabled_queues=$(jq -r '[.[] | select(.status == "Disabled") | .name] | join(", ")' <<< "$queues") disabled_at=$(jq -r '[.[] | select(.status == "Disabled") | .updatedAt] | join(", ")' <<< "$queues") if [[ -n "$disabled_queues" ]]; then + disabled_count=$(jq -r '[.[] | select(.status == "Disabled")] | length' <<< "$queues") add_issue 3 \ - "Service Bus namespace $SB_NAMESPACE_NAME has disabled queues: $disabled_queues disabled at $disabled_at" \ + "Service Bus namespace $SB_NAMESPACE_NAME has $disabled_count disabled queue(s): $disabled_queues" \ "Investigate why these queues are disabled and enable them if needed" \ - "Disabled queues detected" \ + "DISABLED QUEUE ANALYSIS:
+- Disabled Queue(s): $disabled_queues
+- Count: $disabled_count
+- Last Updated: $disabled_at
+- Namespace: $SB_NAMESPACE_NAME
+- Resource Group: $AZ_RESOURCE_GROUP
+
+CONTEXT: Disabled queues cannot send or receive messages, which disrupts message flow and can cause application failures. Queues may be disabled:
+1. Manually by administrators during maintenance
+2. Automatically by Azure due to policy violations
+3. As a result of subscription or namespace suspension
+4. Due to quota exhaustion or security concerns
+
+INVESTIGATION STEPS:
+1. Check Azure portal for queue status and any warning messages
+2. Review Azure Activity Log for who disabled the queue and when
+3. Verify no ongoing maintenance or security incidents
+4. Check for any namespace-level issues affecting multiple queues
+5. Review application logs for errors around the disabled time ($disabled_at)
+6. 
Verify queue configuration and policies are correct + +RECOMMENDATIONS: +- Re-enable queues if disabled unintentionally +- Document maintenance windows if intentionally disabled +- Implement monitoring alerts for queue status changes +- Review access control to prevent unauthorized modifications +- Verify applications have proper error handling for disabled queues + +BUSINESS IMPACT: Disabled queues cause message delivery failures, application errors, and disrupted business workflows requiring immediate attention." \ "$disabled_at" fi @@ -165,10 +194,59 @@ BUSINESS IMPACT: Failed message processing may result in data loss, delayed oper active_count=0 fi if [[ "$active_count" -gt "${ACTIVE_MESSAGE_THRESHOLD:-1000}" ]]; then + # Get additional context for backlog analysis + scheduled_count=$(jq -r '.countDetails.scheduledMessageCount // 0' <<< "$queue_details") + transfer_dead_letter_count=$(jq -r '.countDetails.transferDeadLetterMessageCount // 0' <<< "$queue_details") + transfer_count=$(jq -r '.countDetails.transferMessageCount // 0' <<< "$queue_details") + queue_status=$(jq -r '.status' <<< "$queue_details") + max_delivery_count=$(jq -r '.maxDeliveryCount' <<< "$queue_details") + lock_duration=$(jq -r '.lockDuration' <<< "$queue_details") + + # Ensure all counts are valid numbers + if ! [[ "$scheduled_count" =~ ^[0-9]+$ ]]; then scheduled_count=0; fi + if ! [[ "$transfer_dead_letter_count" =~ ^[0-9]+$ ]]; then transfer_dead_letter_count=0; fi + if ! [[ "$transfer_count" =~ ^[0-9]+$ ]]; then transfer_count=0; fi + add_issue 3 \ - "Queue \`$queue_name\` has $active_count active messages" \ + "Queue $queue_name has $active_count active messages" \ "Verify consumers are processing messages at an adequate rate" \ - "Large number of active messages in queue: $queue_name" + "MESSAGE BACKLOG ANALYSIS: +- Active Messages: $active_count (exceeds threshold of ${ACTIVE_MESSAGE_THRESHOLD:-1000}) +- Scheduled Messages: $scheduled_count +- Transfer Messages: $transfer_count +- Transfer Dead Letter Messages: $transfer_dead_letter_count +- Queue Name: $queue_name +- Queue Status: $queue_status +- Max Delivery Count: $max_delivery_count +- Lock Duration: $lock_duration + +CONTEXT: Large active message counts indicate a processing backlog where messages are arriving faster than they can be consumed. This suggests: +1. Consumer throughput is insufficient for current message volume +2. Consumer applications may be down or experiencing performance issues +3. Message processing logic may be too slow or resource-intensive +4. Scaling issues with consumer infrastructure +5. Lock timeout issues preventing efficient message processing + +INVESTIGATION STEPS: +1. Check consumer application health and availability +2. Monitor consumer processing rates and performance metrics +3. Verify consumer scaling configuration (auto-scaling, instance counts) +4. Analyze message processing duration and identify bottlenecks +5. Review consumer resource utilization (CPU, memory, network) +6. Check for any consumer application errors or exceptions in logs +7. Verify lock duration ($lock_duration) is appropriate for processing time +8. 
Check if max delivery count ($max_delivery_count) is being reached frequently + +RECOMMENDATIONS: +- Scale out consumer instances if processing is CPU/memory bound +- Optimize message processing logic for better throughput +- Implement consumer health monitoring and alerting +- Consider message batching if supported by your application +- Review queue configuration (prefetch count, session handling) +- Adjust lock duration if messages are timing out during processing +- Implement circuit breaker patterns for resilience + +BUSINESS IMPACT: Message processing delays can lead to degraded user experience, delayed business operations, and potential SLA violations." fi # Check if queue is close to max size @@ -178,10 +256,50 @@ BUSINESS IMPACT: Failed message processing may result in data loss, delayed oper size_percent=$(( (size_bytes * 100) / max_size_bytes )) if [[ "$size_percent" -gt "${SIZE_PERCENTAGE_THRESHOLD:-80}" ]]; then + # Get additional context for size analysis + max_size_mb=$(jq -r '.maxSizeInMegabytes' <<< "$queue_details") + message_count=$(jq -r '.countDetails.activeMessageCount // 0' <<< "$queue_details") + auto_delete_idle=$(jq -r '.autoDeleteOnIdle' <<< "$queue_details") + enable_partitioning=$(jq -r '.enablePartitioning' <<< "$queue_details") + add_issue 3 \ - "Queue \`$queue_name\` is at ${size_percent}% of maximum size" \ + "Queue $queue_name is at ${size_percent}% of maximum size" \ "Consider implementing auto-delete of processed messages or increasing queue size" \ - "Queue approaching size limit: $queue_name ($size_percent%)" + "QUEUE SIZE CAPACITY ANALYSIS: +- Current Size: $size_bytes bytes (${size_percent}% of capacity) +- Maximum Size: $max_size_mb MB ($max_size_bytes bytes) +- Active Message Count: $message_count +- Queue Name: $queue_name +- Auto Delete on Idle: $auto_delete_idle +- Partitioning Enabled: $enable_partitioning + +CONTEXT: Queue approaching storage capacity limit indicates that messages are accumulating faster than they are being consumed or are not being deleted after processing. This can lead to: +1. Queue throttling or message rejection when limit is reached +2. Application failures due to inability to send new messages +3. Increased latency in message processing +4. Potential data loss if messages are rejected +5. Service disruption for message producers + +INVESTIGATION STEPS: +1. Verify consumer applications are actively processing and completing messages +2. Check if messages are being explicitly deleted after successful processing +3. Review message retention policies and auto-delete configuration +4. Analyze message size distribution to identify large messages +5. Check if dead-letter messages are contributing to size usage +6. Review historical growth patterns to predict when limit will be reached +7. Verify if partitioning is enabled and functioning correctly + +RECOMMENDATIONS: +- Increase queue maximum size if within namespace quota limits +- Implement aggressive message cleanup after successful processing +- Configure auto-delete on idle if appropriate: $auto_delete_idle +- Enable partitioning to increase throughput and capacity (currently: $enable_partitioning) +- Review message TTL settings to automatically expire old messages +- Consider message archival strategy for long-term retention needs +- Scale out consumers to process backlog faster +- Investigate and remove large or unnecessary messages + +BUSINESS IMPACT: Reaching queue capacity will cause message rejection, application errors, and potential data loss. 
Immediate action required to prevent service disruption." fi done diff --git a/codebundles/azure-servicebus-health/service_bus_topic_health.sh b/codebundles/azure-servicebus-health/service_bus_topic_health.sh index 8cd044222..328bd6fb2 100755 --- a/codebundles/azure-servicebus-health/service_bus_topic_health.sh +++ b/codebundles/azure-servicebus-health/service_bus_topic_health.sh @@ -76,10 +76,41 @@ add_issue() { disabled_topics=$(jq -r '[.[] | select(.status == "Disabled") | .name] | join(", ")' <<< "$topics") disabled_at=$(jq -r '[.[] | select(.status == "Disabled") | .updatedAt] | join(", ")' <<< "$topics") if [[ -n "$disabled_topics" ]]; then + disabled_count=$(jq -r '[.[] | select(.status == "Disabled")] | length' <<< "$topics") add_issue 3 \ - "Service Bus namespace \`$SB_NAMESPACE_NAME\` has disabled topics: $disabled_topics" \ + "Service Bus namespace $SB_NAMESPACE_NAME has $disabled_count disabled topic(s): $disabled_topics" \ "Investigate why these topics are disabled and enable them if needed" \ - "Disabled topics detected" \ + "DISABLED TOPIC ANALYSIS:
+- Disabled Topic(s): $disabled_topics
+- Count: $disabled_count
+- Last Updated: $disabled_at
+- Namespace: $SB_NAMESPACE_NAME
+- Resource Group: $AZ_RESOURCE_GROUP
+
+CONTEXT: Disabled topics cannot send or receive messages, which disrupts message flow to all subscriptions and can cause widespread application failures. Topics may be disabled:
+1. Manually by administrators during maintenance
+2. Automatically by Azure due to policy violations
+3. As a result of subscription or namespace suspension
+4. Due to quota exhaustion or security concerns
+
+INVESTIGATION STEPS:
+1. Check Azure portal for topic status and any warning messages
+2. Review Azure Activity Log for who disabled the topic and when
+3. Verify no ongoing maintenance or security incidents
+4. Check for any namespace-level issues affecting multiple topics
+5. Review application logs for errors around the disabled time ($disabled_at)
+6. Identify all subscriptions affected by the disabled topic(s)
+7. Verify topic configuration and policies are correct
+
+RECOMMENDATIONS:
+- Re-enable topics if disabled unintentionally
+- Document maintenance windows if intentionally disabled
+- Implement monitoring alerts for topic status changes
+- Review access control to prevent unauthorized modifications
+- Verify publisher applications have proper error handling for disabled topics
+- Notify all subscriber teams about the topic status
+
+BUSINESS IMPACT: Disabled topics cause message delivery failures across all subscriptions, widespread application errors, and disrupted business workflows affecting multiple downstream systems. Requires immediate attention."
\ "$disabled_at" fi @@ -101,10 +132,57 @@ for topic_name in $(jq -r '.[].name' <<< "$topics"); do size_percent=$(( (size_bytes * 100) / max_size_bytes )) if [[ "$size_percent" -gt "${SIZE_PERCENTAGE_THRESHOLD:-80}" ]]; then + # Get additional context for size analysis + max_size_mb=$(jq -r '.maxSizeInMegabytes' <<< "$topic_details") + message_count=$(jq -r '.countDetails.activeMessageCount // 0' <<< "$topic_details") + scheduled_count=$(jq -r '.countDetails.scheduledMessageCount // 0' <<< "$topic_details") + topic_status=$(jq -r '.status' <<< "$topic_details") + auto_delete_idle=$(jq -r '.autoDeleteOnIdle' <<< "$topic_details") + enable_partitioning=$(jq -r '.enablePartitioning' <<< "$topic_details") + subscription_count=$(jq -r '.subscriptionCount // 0' <<< "$topic_details") + add_issue 3 \ - "Topic \`$topic_name\` is at ${size_percent}% of maximum size" \ + "Topic $topic_name is at ${size_percent}% of maximum size" \ "Consider implementing auto-delete of processed messages or increasing topic size" \ - "Topic approaching size limit: $topic_name ($size_percent%)" + "TOPIC SIZE CAPACITY ANALYSIS: +- Current Size: $size_bytes bytes (${size_percent}% of capacity) +- Maximum Size: $max_size_mb MB ($max_size_bytes bytes) +- Active Message Count: $message_count +- Scheduled Message Count: $scheduled_count +- Topic Name: $topic_name +- Topic Status: $topic_status +- Subscription Count: $subscription_count +- Auto Delete on Idle: $auto_delete_idle +- Partitioning Enabled: $enable_partitioning + +CONTEXT: Topic approaching storage capacity limit indicates that messages are accumulating faster than subscriptions are consuming them. This can lead to: +1. Topic throttling or message rejection when limit is reached +2. Publisher application failures due to inability to send new messages +3. Increased latency in message processing across all subscriptions +4. Potential data loss if messages are rejected +5. Service disruption affecting all subscribers ($subscription_count subscription(s)) + +INVESTIGATION STEPS: +1. Verify all $subscription_count subscription(s) are actively processing messages +2. Check if subscriptions are properly completing/deleting messages after processing +3. Review message retention policies and auto-delete configuration +4. Analyze message size distribution to identify large messages +5. Check subscription dead-letter queues for messages contributing to size +6. Review historical growth patterns to predict when limit will be reached +7. Verify if partitioning is enabled and functioning correctly +8. Check for inactive or abandoned subscriptions + +RECOMMENDATIONS: +- Increase topic maximum size if within namespace quota limits +- Ensure all subscriptions have active consumers processing messages +- Configure auto-delete on idle if appropriate: $auto_delete_idle +- Enable partitioning to increase throughput and capacity (currently: $enable_partitioning) +- Review message TTL settings to automatically expire old messages +- Remove inactive subscriptions that may be holding messages +- Scale out subscription consumers to process backlog faster +- Investigate and address any dead-letter message accumulation + +BUSINESS IMPACT: Reaching topic capacity will cause message rejection, publisher failures, and potential data loss affecting all $subscription_count subscription(s). Immediate action required to prevent service disruption." 
fi # Get subscriptions for this topic @@ -282,11 +360,52 @@ BUSINESS IMPACT: Message processing delays can lead to degraded user experience, # Check for disabled status status=$(jq -r '.status' <<< "$sub_details") if [[ "$status" == "Disabled" ]]; then + # Get additional context for disabled subscription + updated_at=$(jq -r '.updatedAt' <<< "$sub_details") + max_delivery_count=$(jq -r '.maxDeliveryCount' <<< "$sub_details") + message_count=$(jq -r '.countDetails.activeMessageCount // 0' <<< "$sub_details") + dead_letter_count=$(jq -r '.countDetails.deadLetterMessageCount // 0' <<< "$sub_details") + add_issue 3 \ - "Subscription \`$sub_name\` for topic \`$topic_name\` is disabled" \ + "Subscription $sub_name for topic $topic_name is disabled" \ "Investigate why this subscription is disabled and enable it if needed" \ - "Disabled subscription detected: $topic_name/$sub_name" \ - "$disabled_at" + "DISABLED SUBSCRIPTION ANALYSIS: +- Subscription: $sub_name +- Topic: $topic_name +- Status: Disabled +- Last Updated: $updated_at +- Active Message Count: $message_count +- Dead Letter Message Count: $dead_letter_count +- Max Delivery Count: $max_delivery_count +- Namespace: $SB_NAMESPACE_NAME +- Resource Group: $AZ_RESOURCE_GROUP + +CONTEXT: Disabled subscriptions stop receiving messages from their topic, causing message delivery failures for specific consumers. While the topic continues to function for other subscriptions, this subscription will miss all messages published during the disabled period. Subscriptions may be disabled: +1. Manually by administrators during maintenance or troubleshooting +2. Automatically by Azure due to policy violations or quota issues +3. As part of subscription or namespace-level issues +4. To stop message flow during consumer application updates + +INVESTIGATION STEPS: +1. Check Azure portal for subscription status and any warning messages +2. Review Azure Activity Log for who disabled the subscription and when ($updated_at) +3. Verify consumer application is ready to process messages before re-enabling +4. Check if disabled as part of maintenance or troubleshooting effort +5. Review any accumulated messages ($message_count active, $dead_letter_count dead-letter) +6. Verify subscription configuration and policies are correct +7. Check namespace and topic status for related issues + +RECOMMENDATIONS: +- Re-enable subscription if disabled unintentionally +- Ensure consumer application is healthy before enabling +- Address any message backlog ($message_count messages) that accumulated during downtime +- Review and clear dead-letter queue if needed ($dead_letter_count messages) +- Document maintenance windows if intentionally disabled +- Implement monitoring alerts for subscription status changes +- Consider message recovery strategy for missed messages during disabled period + +BUSINESS IMPACT: Disabled subscription causes message delivery failures for specific consumer applications, leading to data loss, missed events, and disrupted workflows for the affected system. Messages published while disabled cannot be recovered." 
\ + "$updated_at" fi done done diff --git a/codebundles/azure-subscription-cost-health/analyze_storage_optimization.sh b/codebundles/azure-subscription-cost-health/analyze_storage_optimization.sh index 133f931f9..bf996b076 100755 --- a/codebundles/azure-subscription-cost-health/analyze_storage_optimization.sh +++ b/codebundles/azure-subscription-cost-health/analyze_storage_optimization.sh @@ -140,6 +140,48 @@ get_snapshot_cost_per_gb() { echo "0.05" } +# Get blob storage tier pricing per GB per month (General Purpose v2, LRS, East US) +get_blob_tier_cost_per_gb() { + local tier="$1" + case "${tier^^}" in + HOT) echo "0.0184" ;; + COOL) echo "0.0100" ;; + ARCHIVE) echo "0.00099" ;; + *) echo "0.0184" ;; # Default to Hot + esac +} + +# Get storage account capacity in GB from Azure Monitor metrics +get_storage_account_capacity() { + local account_id="$1" + local lookback_days="${2:-7}" # Default to 7 days for capacity check + + local end_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + local start_time=$(date -u -d "$lookback_days days ago" +"%Y-%m-%dT%H:%M:%SZ") + + # Query UsedCapacity metric + local capacity_data=$(az monitor metrics list \ + --resource "$account_id" \ + --metric "UsedCapacity" \ + --start-time "$start_time" \ + --end-time "$end_time" \ + --interval PT1H \ + --aggregation Average \ + -o json 2>/dev/null || echo '{"value":[]}') + + # Get the most recent average capacity in bytes + local capacity_bytes=$(echo "$capacity_data" | jq -r '.value[0].timeseries[0].data[] | select(.average != null) | .average' 2>/dev/null | tail -n1) + + [[ -z "$capacity_bytes" || "$capacity_bytes" == "null" ]] && capacity_bytes="0" + + # Convert bytes to GB + if [[ "$capacity_bytes" != "0" ]]; then + echo "scale=2; $capacity_bytes / 1073741824" | bc -l + else + echo "0" + fi +} + # Storage account redundancy costs (relative multipliers) get_redundancy_savings() { local current_redundancy="$1" @@ -443,10 +485,13 @@ analyze_lifecycle_policies() { local accounts_without_policy=0 local hot_tier_accounts=0 + local total_capacity_gb=0 + local total_savings=0 local account_details="" while IFS= read -r account_data; do local account_name=$(echo "$account_data" | jq -r '.name') + local account_id=$(echo "$account_data" | jq -r '.id') local account_rg=$(echo "$account_data" | jq -r '.resourceGroup') local account_kind=$(echo "$account_data" | jq -r '.kind') local access_tier=$(echo "$account_data" | jq -r '.accessTier // "N/A"') @@ -469,58 +514,129 @@ analyze_lifecycle_policies() { if [[ "$has_policy" != "true" ]]; then accounts_without_policy=$((accounts_without_policy + 1)) + # Get storage account capacity + progress " Fetching capacity for $account_name..." 
+ local capacity_gb=$(get_storage_account_capacity "$account_id" 7) + + local account_savings="0" + local savings_note="N/A" + + # Count Hot tier accounts regardless of capacity data availability if [[ "$access_tier" == "Hot" ]]; then hot_tier_accounts=$((hot_tier_accounts + 1)) fi + # Calculate potential savings if we have capacity data + if [[ "$access_tier" == "Hot" ]] && (( $(echo "$capacity_gb > 0" | bc -l) )); then + total_capacity_gb=$(echo "scale=2; $total_capacity_gb + $capacity_gb" | bc -l) + + # Conservative estimate: 20% of data could move to Cool, 10% to Archive + local hot_cost_per_gb=$(get_blob_tier_cost_per_gb "HOT") + local cool_cost_per_gb=$(get_blob_tier_cost_per_gb "COOL") + local archive_cost_per_gb=$(get_blob_tier_cost_per_gb "ARCHIVE") + + # Calculate savings from tiering + local cool_savings=$(echo "scale=2; $capacity_gb * 0.20 * ($hot_cost_per_gb - $cool_cost_per_gb)" | bc -l) + local archive_savings=$(echo "scale=2; $capacity_gb * 0.10 * ($hot_cost_per_gb - $archive_cost_per_gb)" | bc -l) + account_savings=$(echo "scale=2; $cool_savings + $archive_savings" | bc -l) + account_savings=$(apply_discount "$account_savings") + + total_savings=$(echo "scale=2; $total_savings + $account_savings" | bc -l) + savings_note="\$$account_savings/month potential" + + log " • $account_name ($account_kind, $access_tier tier, ${capacity_gb}GB) - NO lifecycle policy - \$$account_savings/month potential" + else + if [[ "$access_tier" == "Hot" ]]; then + savings_note="(capacity data unavailable)" + log " • $account_name ($account_kind, $access_tier tier) - NO lifecycle policy - capacity unavailable" + else + savings_note="(non-Hot tier)" + log " • $account_name ($account_kind, $access_tier tier) - NO lifecycle policy" + fi + fi + account_details="${account_details} • $account_name - Resource Group: $account_rg - Kind: $account_kind - Access Tier: $access_tier + - Capacity: ${capacity_gb} GB - Replication: $replication - - Lifecycle Policy: ❌ NOT CONFIGURED" - - log " • $account_name ($account_kind, $access_tier tier) - NO lifecycle policy" + - Lifecycle Policy: ❌ NOT CONFIGURED + - Potential Savings: $savings_note" fi done < <(echo "$storage_accounts" | jq -c '.[]') if [[ $accounts_without_policy -gt 0 ]]; then - # Estimate savings based on Hot tier accounts - # Assume 10% of data could be moved to Cool (60% cheaper) and 10% to Archive (95% cheaper) - # This is a conservative estimate - actual savings depend on data access patterns + local annual_savings="0" + local savings_summary="" + local severity=4 + + if (( $(echo "$total_savings > 0" | bc -l) )); then + annual_savings=$(echo "scale=2; $total_savings * 12" | bc -l) + severity=$(get_severity_for_savings "$total_savings") + + savings_summary=" +COST ANALYSIS (Hot Tier Accounts with Capacity Data): +Total Capacity Analyzed: ${total_capacity_gb} GB +Estimated Monthly Savings: \$$total_savings +Estimated Annual Savings: \$$annual_savings + +SAVINGS METHODOLOGY: +- Conservative estimate: 20% of data → Cool tier (46% cheaper) +- Conservative estimate: 10% of data → Archive tier (95% cheaper) +- Based on actual storage capacity from Azure Monitor metrics +- Actual savings depend on data access patterns + +" + else + if [[ $hot_tier_accounts -gt 0 ]]; then + severity=3 + savings_summary=" +NOTE: Unable to calculate exact savings - storage capacity metrics unavailable. 
+Hot tier accounts found: $hot_tier_accounts (these have highest savings potential) + +POTENTIAL SAVINGS (Tier Comparison): +- Hot → Cool: ~46% savings on storage costs +- Hot → Archive: ~95% savings on storage costs +- Example: 1TB Hot (\$18.40/month) → Cool (\$10/month) → Archive (\$0.99/month) + +" + else + savings_summary=" +NOTE: No Hot tier accounts found. Lifecycle policies still recommended for: +- Automatic cleanup of old blob versions +- Deletion of soft-deleted items +- Moving Cool data to Archive after extended periods + +" + fi + fi local details="STORAGE ACCOUNTS WITHOUT LIFECYCLE MANAGEMENT: Subscription: $subscription_name ($subscription_id) Storage Accounts Without Policies: $accounts_without_policy -Hot Tier Accounts (highest savings potential): $hot_tier_accounts +Hot Tier Accounts: $hot_tier_accounts ACCOUNTS WITHOUT LIFECYCLE POLICIES: $account_details +${savings_summary} ISSUE: These storage accounts don't have lifecycle management policies configured. Without lifecycle policies: - All data stays in its original tier forever - Old/inactive data continues at Hot tier pricing - No automatic cleanup of old versions or deleted blobs - -POTENTIAL SAVINGS (Tier Comparison): -- Hot → Cool: ~60% savings on storage costs -- Hot → Archive: ~95% savings on storage costs -- Example: 1TB Hot (\$20/month) → Archive (\$1/month) +- Manual intervention required for cost optimization RECOMMENDATION: Configure lifecycle management policies to: 1. Move data to Cool tier after 30-90 days of no access 2. Move data to Archive tier after 180+ days -3. Delete old blob versions and soft-deleted items" - - local severity=4 - if [[ $hot_tier_accounts -ge 3 ]]; then - severity=3 - fi +3. Delete old blob versions and soft-deleted items +4. Enable soft-delete cleanup for aged data" local next_steps="ACTIONS - Configure Lifecycle Management Policies: @@ -560,7 +676,12 @@ az storage account management-policy create \\ ⚠️ NOTE: Archive tier has retrieval costs and latency. Ensure data access patterns support archival." - add_issue "Storage Lifecycle: $accounts_without_policy account(s) without lifecycle policies" "$details" "$severity" "$next_steps" + local issue_title="Storage Lifecycle: $accounts_without_policy account(s) without lifecycle policies" + if (( $(echo "$total_savings > 0" | bc -l) )); then + issue_title="Storage Lifecycle: $accounts_without_policy account(s) - \$$total_savings/month potential savings" + fi + + add_issue "$issue_title" "$details" "$severity" "$next_steps" else progress " ✓ All storage accounts have lifecycle policies configured" log " ✓ All storage accounts have lifecycle management policies" @@ -590,10 +711,13 @@ analyze_redundancy() { log " Found $account_count geo-redundant storage account(s):" log "" + local total_capacity_gb=0 + local total_savings=0 local account_details="" while IFS= read -r account_data; do local account_name=$(echo "$account_data" | jq -r '.name') + local account_id=$(echo "$account_data" | jq -r '.id') local account_rg=$(echo "$account_data" | jq -r '.resourceGroup') local account_kind=$(echo "$account_data" | jq -r '.kind') local sku_name=$(echo "$account_data" | jq -r '.sku.name') @@ -602,17 +726,88 @@ analyze_redundancy() { local savings_pct=$(get_redundancy_savings "$replication" "LRS") + # Get storage account capacity + progress " Fetching capacity for $account_name..." 
+ local capacity_gb=$(get_storage_account_capacity "$account_id" 7) + + local account_savings="0" + local savings_note="~${savings_pct}%" + + # Calculate actual dollar savings if we have capacity data + if (( $(echo "$capacity_gb > 0" | bc -l) )); then + total_capacity_gb=$(echo "scale=2; $total_capacity_gb + $capacity_gb" | bc -l) + + # Get the tier pricing for current access tier (default to Hot if N/A) + local tier="${access_tier}" + [[ "$tier" == "N/A" ]] && tier="Hot" + local base_cost_per_gb=$(get_blob_tier_cost_per_gb "$tier") + + # Calculate current cost with redundancy multiplier + local current_mult=1.0 + case "${replication^^}" in + LRS) current_mult=1.0 ;; + ZRS) current_mult=1.25 ;; + GRS) current_mult=2.0 ;; + GZRS) current_mult=2.5 ;; + RAGRS|RA-GRS) current_mult=2.1 ;; + RAGZRS|RA-GZRS) current_mult=2.6 ;; + esac + + local current_monthly_cost=$(echo "scale=2; $capacity_gb * $base_cost_per_gb * $current_mult" | bc -l) + local lrs_monthly_cost=$(echo "scale=2; $capacity_gb * $base_cost_per_gb * 1.0" | bc -l) + account_savings=$(echo "scale=2; $current_monthly_cost - $lrs_monthly_cost" | bc -l) + account_savings=$(apply_discount "$account_savings") + + total_savings=$(echo "scale=2; $total_savings + $account_savings" | bc -l) + savings_note="\$$account_savings/month (~${savings_pct}%)" + + log " • $account_name ($replication, ${capacity_gb}GB) - \$$account_savings/month savings potential with LRS" + else + log " • $account_name ($replication) - ~${savings_pct}% savings potential with LRS (capacity unavailable)" + savings_note="~${savings_pct}% (capacity unavailable)" + fi + account_details="${account_details} • $account_name - Resource Group: $account_rg - Kind: $account_kind + - Access Tier: $access_tier + - Capacity: ${capacity_gb} GB - Current SKU: $sku_name - Replication: $replication - - Potential Savings: ~${savings_pct}% if switched to LRS" - - log " • $account_name ($replication) - ~${savings_pct}% savings potential with LRS" + - Potential Savings with LRS: $savings_note" done < <(echo "$storage_accounts" | jq -c '.[]') + local annual_savings="0" + local savings_summary="" + local severity=4 + + if (( $(echo "$total_savings > 0" | bc -l) )); then + annual_savings=$(echo "scale=2; $total_savings * 12" | bc -l) + severity=$(get_severity_for_savings "$total_savings") + + savings_summary=" +COST ANALYSIS (Accounts with Capacity Data): +Total Capacity Analyzed: ${total_capacity_gb} GB +Potential Monthly Savings (if switched to LRS): \$$total_savings +Potential Annual Savings: \$$annual_savings + +SAVINGS CALCULATION: +- Based on actual storage capacity from Azure Monitor metrics +- Calculated using redundancy multipliers relative to LRS baseline +- Assumes complete switch to LRS (review business requirements first!) + +" + else + if [[ $account_count -ge 5 ]]; then + severity=3 + fi + savings_summary=" +NOTE: Unable to calculate exact savings - storage capacity metrics unavailable. 
+ +" + fi + local details="GEO-REDUNDANT STORAGE ACCOUNTS - REVIEW FOR COST SAVINGS: Subscription: $subscription_name ($subscription_id) @@ -621,13 +816,19 @@ Geo-Redundant Accounts: $account_count ACCOUNTS WITH GEO-REDUNDANCY: $account_details +${savings_summary} REDUNDANCY COST COMPARISON (relative to LRS): - LRS (Locally Redundant): 1.0x (baseline) -- ZRS (Zone Redundant): ~1.25x -- GRS (Geo-Redundant): ~2.0x -- RA-GRS (Read-Access Geo): ~2.1x -- GZRS (Geo-Zone Redundant): ~2.5x -- RA-GZRS (Read-Access Geo-Zone): ~2.6x +- ZRS (Zone Redundant): ~1.25x (+25%) +- GRS (Geo-Redundant): ~2.0x (+100%) +- RA-GRS (Read-Access Geo): ~2.1x (+110%) +- GZRS (Geo-Zone Redundant): ~2.5x (+150%) +- RA-GZRS (Read-Access Geo-Zone): ~2.6x (+160%) + +Example: 1TB storage +- LRS: \$18.40/month +- GRS: \$36.80/month (double the cost) +- Savings if GRS→LRS: \$18.40/month per TB EVALUATION CRITERIA: Consider downgrading to LRS or ZRS if: @@ -640,11 +841,6 @@ Keep GRS/GZRS if: ✗ Regulatory compliance requires geo-redundancy ✗ Data is irreplaceable and mission-critical ✗ Business continuity requires cross-region failover" - - local severity=4 - if [[ $account_count -ge 5 ]]; then - severity=3 - fi local next_steps="ACTIONS - Review and Optimize Storage Redundancy: @@ -669,7 +865,12 @@ RECOMMENDED APPROACH: 4. Implement for dev/test accounts first 5. Document compliance requirements before changing prod" - add_issue "Geo-Redundant Storage: $account_count account(s) - Review for potential ~50% savings" "$details" "$severity" "$next_steps" + local issue_title="Geo-Redundant Storage: $account_count account(s) - Review for potential savings" + if (( $(echo "$total_savings > 0" | bc -l) )); then + issue_title="Geo-Redundant Storage: $account_count account(s) - \$$total_savings/month potential (switch to LRS)" + fi + + add_issue "$issue_title" "$details" "$severity" "$next_steps" } # Analysis 5: Find Premium disks with low utilization diff --git a/codebundles/azure-subscription-cost-health/analyze_vm_optimization.sh b/codebundles/azure-subscription-cost-health/analyze_vm_optimization.sh index b40b151b6..d63d7178b 100755 --- a/codebundles/azure-subscription-cost-health/analyze_vm_optimization.sh +++ b/codebundles/azure-subscription-cost-health/analyze_vm_optimization.sh @@ -418,6 +418,7 @@ Note: Deallocation releases compute resources but keeps disks. VM can be restart [[ -z "$avg_memory" || ! "$avg_memory" =~ ^[0-9.]+$ ]] && avg_memory="0" [[ -z "$max_memory" || ! "$max_memory" =~ ^[0-9.]+$ ]] && max_memory="0" + log " VM: $vm_name" log " Average CPU: ${avg_cpu}%" log " Peak CPU: ${max_cpu}%" log " Average Memory: ${avg_memory}%" @@ -429,13 +430,13 @@ Note: Deallocation releases compute resources but keeps disks. VM can be restart memory_available=false log " ⚠️ Memory metrics unavailable - Azure Monitor Agent (AMA) or VM Insights required" log " To enable: Install Azure Monitor Agent or enable VM Insights in Azure Portal" - log " Proceeding with CPU-only analysis..." - progress " ℹ️ Memory metrics unavailable - using CPU-only analysis" + log " Proceeding with CPU-only analysis (using average CPU as primary metric)..." 
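Stepping back to the redundancy multipliers tabulated above, a worked sanity check of the GRS-to-LRS savings math for a hypothetical 200 GB Hot-tier account; figures are illustrative, not live pricing:

```bash
# Worked check of the redundancy multipliers above for a hypothetical
# 200 GB Hot-tier account currently on GRS (2.0x the LRS baseline).
capacity_gb=200
base=0.0184                                   # Hot $/GB/month, LRS baseline
grs=$(echo "scale=2; ($capacity_gb * $base * 2.0) / 1" | bc -l)
lrs=$(echo "scale=2; ($capacity_gb * $base * 1.0) / 1" | bc -l)
diff=$(echo "scale=2; ($grs - $lrs) / 1" | bc -l)
echo "GRS: \$$grs/month  LRS: \$$lrs/month  potential savings: \$$diff/month"
# -> GRS: $7.36/month  LRS: $3.68/month  potential savings: $3.68/month
```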
+ progress " ℹ️ Memory metrics unavailable - using CPU-only analysis (average-based)" fi # Check if VM is underutilized # If memory metrics available: require BOTH CPU and Memory to be underutilized - # If memory metrics unavailable: use CPU-only with a warning flag + # If memory metrics unavailable: use CPU-only with AVERAGE as primary metric local is_underutilized=false local analysis_type="" @@ -448,19 +449,25 @@ Note: Deallocation releases compute resources but keeps disks. VM can be restart fi else # CPU-only analysis (memory unavailable) - # Use a more conservative threshold for CPU-only (20% instead of 30%) - local cpu_only_threshold=20 - if (( $(echo "$max_cpu > 0 && $max_cpu < $cpu_only_threshold" | bc -l 2>/dev/null || echo "0") )); then + # Strategy: Use AVERAGE CPU as primary indicator with reasonable PEAK allowance + # This catches VMs that are mostly idle but can burst (perfect for B-series!) + # Thresholds: avg_cpu < 15% AND peak_cpu < 70% + if (( $(echo "$avg_cpu > 0 && $avg_cpu < 15" | bc -l 2>/dev/null || echo "0") )) && \ + (( $(echo "$max_cpu > 0 && $max_cpu < 70" | bc -l 2>/dev/null || echo "0") )); then is_underutilized=true - analysis_type="CPU-only" + analysis_type="CPU-only (avg-based)" + log " 💡 Underutilization detected: Avg CPU ${avg_cpu}% < 15% and Peak CPU ${max_cpu}% < 70%" + progress " ⚠️ VM appears underutilized - mostly idle with occasional bursts (good B-series candidate)" fi fi if [[ "$is_underutilized" == "true" ]]; then - if [[ "$analysis_type" == "CPU-only" ]]; then - progress " ⚠️ VM is underutilized [${analysis_type}] (peak CPU: ${max_cpu}%)" + if [[ "$analysis_type" == "CPU-only (avg-based)" ]]; then + progress " 🎯 RECOMMENDATION: $vm_name is underutilized [${analysis_type}]" + progress " Avg CPU: ${avg_cpu}% | Peak CPU: ${max_cpu}% | Memory: N/A" else - progress " ⚠️ VM is underutilized [${analysis_type}] (peak CPU: ${max_cpu}%, peak Memory: ${max_memory}%)" + progress " 🎯 RECOMMENDATION: $vm_name is underutilized [${analysis_type}]" + progress " Avg CPU: ${avg_cpu}% | Peak CPU: ${max_cpu}% | Peak Memory: ${max_memory}%" fi # Suggest smaller VM size @@ -501,31 +508,44 @@ Note: Deallocation releases compute resources but keeps disks. VM can be restart # Only report if savings are significant (>$50/month) if (( $(echo "$savings > 50" | bc -l) )); then + progress " 💰 Potential Savings: \$$savings/month by switching to $suggested_vm" local annual_savings=$(echo "scale=2; $savings * 12" | bc -l) local severity=$(get_severity_for_savings "$savings") local utilization_section="" local rationale_section="" - if [[ "$analysis_type" == "CPU-only" ]]; then - utilization_section="UTILIZATION ANALYSIS (${LOOKBACK_DAYS} days) [CPU-ONLY]: -- Average CPU: ${avg_cpu}% -- Peak CPU: ${max_cpu}% (threshold: 20% for CPU-only) + if [[ "$analysis_type" == "CPU-only (avg-based)" ]]; then + utilization_section="UTILIZATION ANALYSIS (${LOOKBACK_DAYS} days) [CPU-ONLY - AVERAGE-BASED]: +- VM Name: ${vm_name} +- Average CPU: ${avg_cpu}% ← Primary metric (threshold: <15%) +- Peak CPU: ${max_cpu}% (threshold: <70%) - Memory: ⚠️ Not available (Azure Monitor Agent required) +ANALYSIS APPROACH: +Since memory metrics are unavailable, we use AVERAGE CPU as the primary indicator. +A VM averaging <15% CPU with peaks <70% is mostly idle with occasional bursts. +This is the ideal use case for B-series burstable VMs. + NOTE: This recommendation is based on CPU metrics only. Memory metrics require Azure Monitor Agent or VM Insights to be enabled. 
-Please verify memory utilization manually before resizing."
- rationale_section="CPU utilization (peak: ${max_cpu}%) is well below the conservative 20% threshold.
+⚠️ Please verify memory utilization manually before resizing to ensure it's not constrained."
+ rationale_section="Average CPU (${avg_cpu}%) is very low, indicating the VM is mostly idle.
+Peak CPU (${max_cpu}%) shows occasional bursts, which B-series burst credits can absorb.
 ⚠️ CAUTION: Memory utilization was not analyzed. Verify memory is not constrained before resizing.
-B-series VMs provide burst capacity for occasional spikes while saving costs."
+B-series VMs provide burst capacity for occasional spikes while saving costs on idle time."
 else
- utilization_section="UTILIZATION ANALYSIS (${LOOKBACK_DAYS} days):
+ utilization_section="UTILIZATION ANALYSIS (${LOOKBACK_DAYS} days) [FULL - CPU+MEMORY]:
+- VM Name: ${vm_name}
 - Average CPU: ${avg_cpu}%
 - Peak CPU: ${max_cpu}% (threshold: ${CPU_UNDERUTILIZATION_THRESHOLD}%)
 - Average Memory: ${avg_memory}%
-- Peak Memory: ${max_memory}% (threshold: ${MEMORY_UNDERUTILIZATION_THRESHOLD}%)"
+- Peak Memory: ${max_memory}% (threshold: ${MEMORY_UNDERUTILIZATION_THRESHOLD}%)
+
+ANALYSIS APPROACH:
+Both CPU and Memory metrics are available. VM is underutilized across both dimensions."
 rationale_section="Both CPU (peak: ${max_cpu}%) and Memory (peak: ${max_memory}%) are well below thresholds.
+This VM is consistently underutilized and is a strong candidate for downsizing.
 B-series VMs provide burst capacity for occasional spikes while saving costs."
 fi
diff --git a/codebundles/azure-subscription-cost-health/runbook.robot b/codebundles/azure-subscription-cost-health/runbook.robot
index 5658061c3..abb12a9c6 100644
--- a/codebundles/azure-subscription-cost-health/runbook.robot
+++ b/codebundles/azure-subscription-cost-health/runbook.robot
@@ -21,7 +21,7 @@ Generate Azure Cost Report By Service and Resource Group
     ${cost_report}=    RW.CLI.Run Bash File
     ...    bash_file=azure_cost_historical_report.sh
     ...    env=${env}
-    ...    timeout_seconds=600
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ...    show_in_rwl_cheatsheet=true
     RW.Core.Add Pre To Report    ${cost_report.stdout}
@@ -31,7 +31,7 @@ Generate Azure Cost Report By Service and Resource Group
     ${trend_issues}=    RW.CLI.Run Cli
     ...    cmd=cat azure_cost_trend_issues.json
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ${trend_issue_list}=    Evaluate    json.loads(r'''${trend_issues.stdout}''')    json
     IF    len(@{trend_issue_list}) > 0
@@ -53,7 +53,7 @@ Analyze App Service Plan Cost Optimization
     ${cost_analysis}=    RW.CLI.Run Bash File
     ...    bash_file=azure_appservice_cost_optimization.sh
     ...    env=${env}
-    ...    timeout_seconds=600
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ...    show_in_rwl_cheatsheet=true
     RW.Core.Add Pre To Report    ${cost_analysis.stdout}
@@ -62,7 +62,7 @@ Analyze App Service Plan Cost Optimization
     ${summary_cmd}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "azure_appservice_cost_optimization_issues.json" ]; then echo "Cost Health Analysis Summary:"; echo "============================"; jq -r 'group_by(.severity) | map({severity: .[0].severity, count: length}) | sort_by(.severity) | .[] | "Severity \\(.severity): \\(.count) issue(s)"' azure_appservice_cost_optimization_issues.json; echo ""; echo "Top Cost Savings Opportunities:"; jq -r 'sort_by(.severity) | limit(5; .[]) | "- \\(.title)"' azure_appservice_cost_optimization_issues.json; else echo "No cost analysis data available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${summary_cmd.stdout}
@@ -71,7 +71,7 @@ Analyze App Service Plan Cost Optimization
     ${savings_summary}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "azure_appservice_cost_optimization_report.txt" ]; then echo ""; echo "Detailed Analysis Report:"; echo "========================"; tail -20 azure_appservice_cost_optimization_report.txt; else echo "No detailed report available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${savings_summary.stdout}
@@ -79,7 +79,7 @@ Analyze App Service Plan Cost Optimization
     ${issues}=    RW.CLI.Run Cli
     ...    cmd=cat azure_appservice_cost_optimization_issues.json
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
     IF    len(@{issue_list}) > 0
@@ -112,7 +112,7 @@ Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics
     ${aks_summary_cmd}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "aks_node_pool_optimization_issues.json" ]; then echo "AKS Node Pool Optimization Summary:"; echo "===================================="; jq -r 'group_by(.severity) | map({severity: .[0].severity, count: length}) | sort_by(.severity) | .[] | "Severity \\(.severity): \\(.count) issue(s)"' aks_node_pool_optimization_issues.json; echo ""; echo "Top Optimization Opportunities:"; jq -r 'sort_by(.severity) | limit(5; .[]) | "- \\(.title)"' aks_node_pool_optimization_issues.json; else echo "No AKS optimization data available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${aks_summary_cmd.stdout}
@@ -121,7 +121,7 @@ Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics
     ${aks_details}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "aks_node_pool_optimization_report.txt" ]; then echo ""; echo "Detailed AKS Optimization Report:"; echo "=================================="; tail -30 aks_node_pool_optimization_report.txt; else echo "No detailed AKS report available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${aks_details.stdout}
@@ -129,7 +129,7 @@ Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics
     ${aks_issues}=    RW.CLI.Run Cli
     ...    cmd=cat aks_node_pool_optimization_issues.json
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ${aks_issue_list}=    Evaluate    json.loads(r'''${aks_issues.stdout}''')    json
     IF    len(@{aks_issue_list}) > 0
@@ -162,7 +162,7 @@ Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities
     ${databricks_summary_cmd}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "databricks_cluster_optimization_issues.json" ]; then echo "Databricks Cluster Optimization Summary:"; echo "========================================="; jq -r 'group_by(.severity) | map({severity: .[0].severity, count: length}) | sort_by(.severity) | .[] | "Severity \\(.severity): \\(.count) issue(s)"' databricks_cluster_optimization_issues.json; echo ""; echo "Top Optimization Opportunities:"; jq -r 'sort_by(.severity) | limit(5; .[]) | "- \\(.title)"' databricks_cluster_optimization_issues.json; else echo "No Databricks optimization data available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${databricks_summary_cmd.stdout}
@@ -171,7 +171,7 @@ Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities
     ${databricks_details}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "databricks_cluster_optimization_report.txt" ]; then echo ""; echo "Detailed Databricks Optimization Report:"; echo "========================================"; tail -30 databricks_cluster_optimization_report.txt; else echo "No detailed Databricks report available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${databricks_details.stdout}
@@ -179,7 +179,7 @@ Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities
     ${databricks_issues}=    RW.CLI.Run Cli
     ...    cmd=cat databricks_cluster_optimization_issues.json
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ${databricks_issue_list}=    Evaluate    json.loads(r'''${databricks_issues.stdout}''')    json
     IF    len(@{databricks_issue_list}) > 0
@@ -212,7 +212,7 @@ Analyze Virtual Machine Rightsizing and Deallocation Opportunities
     ${vm_summary_cmd}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "vm_optimization_issues.json" ]; then echo "Virtual Machine Optimization Summary:"; echo "====================================="; jq -r 'group_by(.severity) | map({severity: .[0].severity, count: length}) | sort_by(.severity) | .[] | "Severity \\(.severity): \\(.count) issue(s)"' vm_optimization_issues.json; echo ""; echo "Top Optimization Opportunities:"; jq -r 'sort_by(.severity) | limit(5; .[]) | "- \\(.title)"' vm_optimization_issues.json; else echo "No VM optimization data available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${vm_summary_cmd.stdout}
@@ -221,7 +221,7 @@ Analyze Virtual Machine Rightsizing and Deallocation Opportunities
     ${vm_details}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "vm_optimization_report.txt" ]; then echo ""; echo "Detailed VM Optimization Report:"; echo "================================"; tail -30 vm_optimization_report.txt; else echo "No detailed VM report available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${vm_details.stdout}
@@ -229,7 +229,7 @@ Analyze Virtual Machine Rightsizing and Deallocation Opportunities
     ${vm_issues}=    RW.CLI.Run Cli
     ...    cmd=cat vm_optimization_issues.json
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ${vm_issue_list}=    Evaluate    json.loads(r'''${vm_issues.stdout}''')    json
     IF    len(@{vm_issue_list}) > 0
@@ -263,7 +263,7 @@ Analyze Virtual Machine Rightsizing and Deallocation Opportunities
     ${consolidated_next_steps}=    RW.CLI.Run Cli
     ...    cmd=echo "PRIORITIZED ACTION PLAN:"; echo ""; echo "1. Review all ${issue_count} VM recommendations above"; echo "2. Start with highest-savings VMs first"; echo "3. For each VM:"; echo "   a. Verify current utilization matches analysis"; echo "   b. Test resize in dev/test first if available"; echo "   c. Execute resize command during maintenance window"; echo "   d. Monitor for 24-48 hours post-resize"; echo ""; echo "NOTE: All B-series recommendations are burstable instances."; echo "They provide baseline performance with the ability to burst to 100% CPU when needed."; echo "Ideal for workloads with low average CPU but occasional spikes."
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     # Determine severity based on total savings
@@ -299,7 +299,7 @@ Analyze Azure Storage Cost Optimization Opportunities
     ${storage_summary_cmd}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "storage_optimization_issues.json" ]; then echo "Storage Cost Optimization Summary:"; echo "==================================="; jq -r 'group_by(.severity) | map({severity: .[0].severity, count: length}) | sort_by(.severity) | .[] | "Severity \\(.severity): \\(.count) issue(s)"' storage_optimization_issues.json; echo ""; echo "Top Optimization Opportunities:"; jq -r 'sort_by(.severity) | limit(5; .[]) | "- \\(.title)"' storage_optimization_issues.json; else echo "No storage optimization data available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${storage_summary_cmd.stdout}
@@ -308,7 +308,7 @@ Analyze Azure Storage Cost Optimization Opportunities
     ${storage_details}=    RW.CLI.Run Cli
     ...    cmd=if [ -f "storage_optimization_report.txt" ]; then echo ""; echo "Detailed Storage Optimization Report:"; echo "====================================="; tail -40 storage_optimization_report.txt; else echo "No detailed storage report available"; fi
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     RW.Core.Add Pre To Report    ${storage_details.stdout}
@@ -316,7 +316,7 @@ Analyze Azure Storage Cost Optimization Opportunities
     ${storage_issues}=    RW.CLI.Run Cli
     ...    cmd=cat storage_optimization_issues.json
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     ${storage_issue_list}=    Evaluate    json.loads(r'''${storage_issues.stdout}''')    json
     IF    len(@{storage_issue_list}) > 0
@@ -429,7 +429,7 @@ Suite Initialization
     ${auth_check}=    RW.CLI.Run Cli
     ...    cmd=az account show --query "{subscriptionId: id, subscriptionName: name, tenantId: tenantId, user: user.name}" -o table
     ...    env=${env}
-    ...    timeout_seconds=300
+    ...    timeout_seconds=900
     ...    include_in_history=false
     Log    Current Azure Context: ${auth_check.stdout}
@@ -438,7 +438,7 @@ Suite Initialization
     ${subscription_validation}=    RW.CLI.Run Cli
     ...    cmd=if [ -n "$AZURE_SUBSCRIPTION_IDS" ]; then echo "Validating access to target subscriptions:"; for sub_id in $(echo "$AZURE_SUBSCRIPTION_IDS" | tr ',' ' '); do echo "Checking subscription: $sub_id"; az account show --subscription "$sub_id" --query "{id: id, name: name, state: state}" -o table 2>/dev/null || echo "❌ Cannot access subscription: $sub_id"; done; else echo "Using current subscription context"; az account show --query "{id: id, name: name, state: state}" -o table; fi
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     Log    Subscription Access Validation: ${subscription_validation.stdout}
@@ -447,7 +447,7 @@ Suite Initialization
     ${permissions_check}=    RW.CLI.Run Cli
     ...    cmd=echo "Checking required permissions:"; echo "- App Service Plans: $(az provider show --namespace Microsoft.Web --query "registrationState" -o tsv 2>/dev/null || echo 'Not available')"; echo "- Function Apps: $(az functionapp list --query "length(@)" -o tsv 2>/dev/null && echo 'Access granted' || echo 'Access denied')"; echo "- Resource Groups: $(az group list --query "length(@)" -o tsv 2>/dev/null && echo 'Access granted' || echo 'Access denied')"; echo "- Monitor Metrics: $(az provider show --namespace Microsoft.Insights --query "registrationState" -o tsv 2>/dev/null || echo 'Not available')"
     ...    env=${env}
-    ...    timeout_seconds=60
+    ...    timeout_seconds=900
     ...    include_in_history=false
     Log    Permission Check Results: ${permissions_check.stdout}
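
Reviewer note: the first hunk above tails the VM rightsizing script's branch between a CPU-only analysis and a full CPU+memory analysis. A minimal standalone sketch of that branching follows, assuming the variable names visible in the hunk (avg_cpu, max_cpu, max_memory, CPU_UNDERUTILIZATION_THRESHOLD, MEMORY_UNDERUTILIZATION_THRESHOLD); the default thresholds and stand-in metric values are illustrative, not the script's actual code.

#!/usr/bin/env bash
# Sketch only: pick the CPU-only or CPU+memory rationale depending on
# whether memory metrics came back from the platform.
CPU_UNDERUTILIZATION_THRESHOLD="${CPU_UNDERUTILIZATION_THRESHOLD:-20}"        # assumed default
MEMORY_UNDERUTILIZATION_THRESHOLD="${MEMORY_UNDERUTILIZATION_THRESHOLD:-40}"  # assumed default
avg_cpu="3.2"; max_cpu="14.8"   # stand-in values; normally parsed from az monitor metrics output
max_memory=""                   # empty when the memory counter was not collected

if [[ -z "$max_memory" ]]; then
    # CPU-only path: recommend a resize, but flag the unverified memory dimension.
    rationale_section="Average CPU (${avg_cpu}%) is very low, indicating the VM is mostly idle.
Peak CPU (${max_cpu}%) stays under the ${CPU_UNDERUTILIZATION_THRESHOLD}% threshold.
⚠️ CAUTION: Memory utilization was not analyzed. Verify memory is not constrained before resizing."
else
    # Full path: both dimensions were measured, so the recommendation is stronger.
    rationale_section="Both CPU (peak: ${max_cpu}%) and Memory (peak: ${max_memory}%) are below the ${CPU_UNDERUTILIZATION_THRESHOLD}%/${MEMORY_UNDERUTILIZATION_THRESHOLD}% thresholds."
fi
printf '%s\n' "$rationale_section"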
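
Reviewer note: every summary task in this runbook reuses the same jq rollup to count issues per severity. For anyone reviewing the cmd= lines, this is what the filter produces on made-up input (the doubled backslashes in the .robot file are Robot Framework escaping; in a plain shell the filter uses single backslashes):

$ echo '[{"severity":3,"title":"Resize vm-a"},{"severity":1,"title":"Resize vm-b"},{"severity":3,"title":"Resize vm-c"}]' \
    | jq -r 'group_by(.severity) | map({severity: .[0].severity, count: length}) | sort_by(.severity) | .[] | "Severity \(.severity): \(.count) issue(s)"'
Severity 1: 1 issue(s)
Severity 3: 2 issue(s)

Because this codebundle treats severity 1 as most critical, sorting ascending by severity also puts the most urgent items first in the "Top Opportunities" lists.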