Skip to content

Commit 35dc483

Browse files
author
Fabrício Duarte
committed
Merge branch 'improve-vr-health-check-fail-email-alert-and-logs' into '4.20.0.0-scclouds'
Reescrita de alerta de email sobre _health checks_ do VR. Closes #2074. See merge request scclouds/scclouds!1202.
2 parents 6282083 + f700bb6 commit 35dc483

2 files changed

Lines changed: 26 additions & 18 deletions

File tree

engine/schema/src/main/java/com/cloud/network/dao/RouterHealthCheckResultDaoImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public RouterHealthCheckResultVO getRouterHealthCheckResult(long routerId, Strin
6464
SearchCriteria<RouterHealthCheckResultVO> sc = RouterChecksSearchBuilder.create();
6565
sc.setParameters("routerId", routerId);
6666
sc.setParameters("checkName", checkName);
67-
sc.setParameters("checkType", checkType);
67+
sc.setParametersIfNotNull("checkType", checkType);
6868
List<RouterHealthCheckResultVO> checks = listBy(sc);
6969
if (checks.size() > 1) {
7070
logger.error("Found multiple entries for router Id: " + routerId + ", check name: " + checkName);

server/src/main/java/com/cloud/network/router/VirtualNetworkApplianceManagerImpl.java

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
import javax.naming.ConfigurationException;
5151

5252
import org.apache.cloudstack.acl.ApiKeyPairVO;
53-
import org.apache.cloudstack.alert.AlertService;
5453
import org.apache.cloudstack.alert.AlertService.AlertType;
5554
import org.apache.cloudstack.api.ApiCommandResourceType;
5655
import org.apache.cloudstack.api.command.admin.router.RebootRouterCmd;
@@ -871,7 +870,7 @@ protected void updateSite2SiteVpnConnectionState(final List<DomainRouterVO> rout
871870
"Site-to-site Vpn Connection to " + gw.getName() + " on router " + router.getHostName() + "(id: " + router.getId() + ") " +
872871
" just switched from " + oldState + " to " + conn.getState();
873872
logger.info(context);
874-
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), title, context);
873+
_alertMgr.sendAlert(AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), title, context);
875874
}
876875
}
877876
} finally {
@@ -933,7 +932,7 @@ protected void updateRoutersRedundantState(final List<DomainRouterVO> routers) {
933932
+ currState;
934933
logger.info(context);
935934
if (currState == RedundantState.PRIMARY) {
936-
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), title, context);
935+
_alertMgr.sendAlert(AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), title, context);
937936
}
938937
}
939938
}
@@ -950,7 +949,7 @@ protected void recoverRedundantNetwork(final DomainRouterVO primaryRouter, final
950949
if (logger.isDebugEnabled()) {
951950
logger.debug(title);
952951
}
953-
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER, backupRouter.getDataCenterId(), backupRouter.getPodIdToDeployIn(), title, title);
952+
_alertMgr.sendAlert(AlertType.ALERT_TYPE_DOMAIN_ROUTER, backupRouter.getDataCenterId(), backupRouter.getPodIdToDeployIn(), title, title);
954953
try {
955954
rebootRouter(backupRouter.getId(), true, false);
956955
} catch (final ConcurrentOperationException e) {
@@ -1039,7 +1038,7 @@ private void checkDuplicatePrimary(final List<DomainRouterVO> routers) {
10391038
+ dupRouter.getHostName();
10401039
final String context = "Virtual router (name: " + router.getHostName() + ", id: " + router.getId() + " and router (name: " + dupRouter.getHostName()
10411040
+ ", id: " + router.getId() + ") are both in PRIMARY state! If the problem persist, restart both of routers. ";
1042-
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), title, context);
1041+
_alertMgr.sendAlert(AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), title, context);
10431042
logger.warn(context);
10441043
} else {
10451044
networkRouterMaps.put(routerGuestNtwkId, router);
@@ -1203,10 +1202,20 @@ private void handleFailingChecks(DomainRouterVO router, List<String> failingChec
12031202
if (failingChecks == null || failingChecks.size() == 0) {
12041203
return;
12051204
}
1205+
String subject = String.format("Failed health checks on router [%s]", router.getName());
1206+
String alertMessage = String.format("The following health checks have failed on router [%s] with UUID [%s]: ", router.getName(), router.getUuid());
1207+
ArrayList<String> failedChecks = new ArrayList<>();
1208+
for (String failedCheckName : failingChecks) {
1209+
RouterHealthCheckResultVO routerHealthCheckResultVO = routerHealthCheckResultDao.getRouterHealthCheckResult(router.getId(), failedCheckName, null);
1210+
String failedCheckDetails = routerHealthCheckResultVO.getParsedCheckDetails();
1211+
failedCheckDetails = failedCheckDetails.replace("\n", " ");
1212+
String failedCheck = failedCheckName + ": " + failedCheckDetails;
1213+
failedChecks.add(failedCheck);
1214+
}
1215+
alertMessage = alertMessage + failedChecks;
12061216

1207-
String alertMessage = String.format("Health checks failed: %d failing checks on router %s / %s", failingChecks.size(), router.getName(), router.getUuid());
12081217
_alertMgr.sendAlert(AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(),
1209-
alertMessage, alertMessage);
1218+
subject, alertMessage);
12101219
logger.warn(alertMessage + ". Checking failed health checks to see if router needs recreate");
12111220

12121221
String checkFailsToRecreateVr = RouterHealthChecksFailuresToRecreateVr.valueIn(router.getDataCenterId());
@@ -1215,11 +1224,11 @@ private void handleFailingChecks(DomainRouterVO router, List<String> failingChec
12151224
for (int i = 0; i < failingChecks.size(); i++) {
12161225
String failedCheck = failingChecks.get(i);
12171226
if (i == 0) {
1218-
failingChecksEvent.append("Router ")
1227+
failingChecksEvent.append("Router [")
12191228
.append(router.getName())
1220-
.append(" / ")
1229+
.append("] with UUID [")
12211230
.append(router.getUuid())
1222-
.append(" has failing checks: ");
1231+
.append("] has failing checks: ");
12231232
}
12241233

12251234
failingChecksEvent.append(failedCheck);
@@ -1236,8 +1245,7 @@ private void handleFailingChecks(DomainRouterVO router, List<String> failingChec
12361245
Domain.ROOT_DOMAIN, EventTypes.EVENT_ROUTER_HEALTH_CHECKS, failingChecksEvent.toString(), router.getId(), ApiCommandResourceType.DomainRouter.toString());
12371246

12381247
if (recreateRouter) {
1239-
logger.warn("Health Check Alert: Found failing checks in " +
1240-
RouterHealthChecksFailuresToRecreateVrCK + ", attempting recreating router.");
1248+
logger.warn("Health Check Alert: Found failing checks in [{}], attempting to recreate router with id [{}].", RouterHealthChecksFailuresToRecreateVrCK, router.getId());
12411249
recreateRouter(router.getId());
12421250
}
12431251
}
@@ -2305,7 +2313,7 @@ private Provider getVrProvider(DomainRouterVO router) {
23052313
if (vrProvider == null) {
23062314
throw new CloudRuntimeException("Cannot find related virtual router provider of router: " + router.getHostName());
23072315
}
2308-
final Provider provider = Network.Provider.getProvider(vrProvider.getType().toString());
2316+
final Provider provider = Provider.getProvider(vrProvider.getType().toString());
23092317
if (provider == null) {
23102318
throw new CloudRuntimeException("Cannot find related provider of virtual router provider: " + vrProvider.getType().toString());
23112319
}
@@ -2721,7 +2729,7 @@ protected void finalizeIpAssocForNetwork(final Commands cmds, final VirtualRoute
27212729
}
27222730

27232731
protected ArrayList<? extends PublicIpAddress> getPublicIpsToApply(final Provider provider, final Long guestNetworkId,
2724-
final com.cloud.network.IpAddress.State... skipInStates) {
2732+
final IpAddress.State... skipInStates) {
27252733

27262734
final List<? extends IpAddress> userIps = _networkModel.listPublicIpsAssignedToGuestNtwk(guestNetworkId, null);
27272735

@@ -2770,7 +2778,7 @@ public boolean finalizeStart(final VirtualMachineProfile profile, final long hos
27702778
final String errorMessage = "Command: " + cmdClassName + " failed while starting virtual router";
27712779
final String errorDetails = "Details: " + answer.getDetails() + " " + answer.toString();
27722780
// add alerts for the failed commands
2773-
_alertMgr.sendAlert(AlertService.AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), errorMessage, errorDetails);
2781+
_alertMgr.sendAlert(AlertType.ALERT_TYPE_DOMAIN_ROUTER, router.getDataCenterId(), router.getPodIdToDeployIn(), errorMessage, errorDetails);
27742782
logger.error(answer.getDetails());
27752783
logger.warn(errorMessage);
27762784
// Stop the router if any of the commands failed
@@ -3196,7 +3204,7 @@ public <T extends VirtualRouter> void collectNetworkStatistics(final T router, f
31963204
continue;
31973205
}
31983206
if (forVpc && network.getTrafficType() == TrafficType.Public || !forVpc && network.getTrafficType() == TrafficType.Guest
3199-
&& network.getGuestType() == Network.GuestType.Isolated) {
3207+
&& network.getGuestType() == GuestType.Isolated) {
32003208
final NetworkUsageCommand usageCmd = new NetworkUsageCommand(privateIP, router.getHostName(), forVpc, routerNic.getIPv4Address());
32013209
final String routerType = router.getType().toString();
32023210
final UserStatisticsVO previousStats = _userStatsDao.findBy(router.getAccountId(), router.getDataCenterId(), network.getId(),
@@ -3448,7 +3456,7 @@ private boolean isOutOfBandMigrated(final Object opaque) {
34483456
return false;
34493457
}
34503458

3451-
protected boolean aggregationExecution(final AggregationControlCommand.Action action, final Network network, final List<DomainRouterVO> routers)
3459+
protected boolean aggregationExecution(final Action action, final Network network, final List<DomainRouterVO> routers)
34523460
throws AgentUnavailableException, ResourceUnavailableException {
34533461

34543462
int errors = 0;

0 commit comments

Comments
 (0)