2121import java .io .PrintWriter ;
2222import java .io .StringWriter ;
2323import java .net .InetAddress ;
24+ import java .net .InetSocketAddress ;
25+ import java .net .Socket ;
2426import java .net .UnknownHostException ;
2527import java .nio .channels .ClosedChannelException ;
2628import java .nio .charset .Charset ;
3840import javax .naming .ConfigurationException ;
3941
4042import org .apache .cloudstack .agent .directdownload .SetupDirectDownloadCertificate ;
43+ import org .apache .cloudstack .agent .lb .SetupMSListAnswer ;
44+ import org .apache .cloudstack .agent .lb .SetupMSListCommand ;
4145import org .apache .cloudstack .ca .SetupCertificateAnswer ;
4246import org .apache .cloudstack .ca .SetupCertificateCommand ;
4347import org .apache .cloudstack .ca .SetupKeyStoreCommand ;
4448import org .apache .cloudstack .ca .SetupKeystoreAnswer ;
4549import org .apache .cloudstack .managed .context .ManagedContextTimerTask ;
4650import org .apache .cloudstack .utils .security .KeyStoreUtils ;
51+ import org .apache .commons .collections .CollectionUtils ;
4752import org .apache .commons .io .FileUtils ;
4853import org .apache .log4j .Logger ;
4954import org .slf4j .MDC ;
6570import com .cloud .exception .AgentControlChannelException ;
6671import com .cloud .resource .ServerResource ;
6772import com .cloud .utils .PropertiesUtil ;
73+ import com .cloud .utils .StringUtils ;
6874import com .cloud .utils .backoff .BackoffAlgorithm ;
6975import com .cloud .utils .concurrency .NamedThreadFactory ;
7076import com .cloud .utils .exception .CloudRuntimeException ;
@@ -121,6 +127,7 @@ public int value() {
121127 Long _id ;
122128
123129 Timer _timer = new Timer ("Agent Timer" );
130+ Timer hostLBTimer ;
124131
125132 List <WatchTask > _watchList = new ArrayList <WatchTask >();
126133 long _sequence = 0 ;
@@ -144,7 +151,7 @@ public Agent(final IAgentShell shell) {
144151 _shell = shell ;
145152 _link = null ;
146153
147- _connection = new NioClient ("Agent" , _shell .getHost (), _shell .getPort (), _shell .getWorkers (), this );
154+ _connection = new NioClient ("Agent" , _shell .getNextHost (), _shell .getPort (), _shell .getWorkers (), this );
148155
149156 Runtime .getRuntime ().addShutdownHook (new ShutdownThread (this ));
150157
@@ -179,7 +186,7 @@ public Agent(final IAgentShell shell, final int localAgentId, final ServerResour
179186 throw new ConfigurationException ("Unable to configure " + _resource .getName ());
180187 }
181188
182- final String host = _shell .getHost ();
189+ final String host = _shell .getNextHost ();
183190 _connection = new NioClient ("Agent" , host , _shell .getPort (), _shell .getWorkers (), this );
184191
185192 // ((NioClient)_connection).setBindAddress(_shell.getPrivateIp());
@@ -255,7 +262,7 @@ public void start() {
255262 s_logger .info ("Attempted to connect to the server, but received an unexpected exception, trying again..." );
256263 }
257264 while (!_connection .isStartup ()) {
258- final String host = _shell .getHost ();
265+ final String host = _shell .getNextHost ();
259266 _shell .getBackoffAlgorithm ().waitBeforeRetry ();
260267 _connection = new NioClient ("Agent" , host , _shell .getPort (), _shell .getWorkers (), this );
261268 s_logger .info ("Connecting to host:" + host );
@@ -266,6 +273,7 @@ public void start() {
266273 s_logger .info ("Attempted to connect to the server, but received an unexpected exception, trying again..." );
267274 }
268275 }
276+ _shell .updateConnectedHost ();
269277 }
270278
271279 public void stop (final String reason , final String detail ) {
@@ -310,6 +318,17 @@ public void setId(final Long id) {
310318 _shell .setPersistentProperty (getResourceName (), "id" , Long .toString (id ));
311319 }
312320
321+ private synchronized void scheduleHostLBCheckerTask (final long checkInterval ) {
322+ if (hostLBTimer != null ) {
323+ hostLBTimer .cancel ();
324+ }
325+ if (checkInterval > 0L ) {
326+ s_logger .info ("Scheduling preferred host timer task with host.lb.interval=" + checkInterval + "ms" );
327+ hostLBTimer = new Timer ("Host LB Timer" );
328+ hostLBTimer .scheduleAtFixedRate (new PreferredHostCheckerTask (), checkInterval , checkInterval );
329+ }
330+ }
331+
313332 public void scheduleWatch (final Link link , final Request request , final long delay , final long period ) {
314333 synchronized (_watchList ) {
315334 if (s_logger .isDebugEnabled ()) {
@@ -332,18 +351,20 @@ protected void cancelTasks() {
332351 _watchList .clear ();
333352 }
334353 }
335- public synchronized void lockStartupTask ( final Link link )
336- {
354+
355+ public synchronized void lockStartupTask ( final Link link ) {
337356 _startup = new StartupTask (link );
338357 _timer .schedule (_startup , _startupWait );
339358 }
340359
341360 public void sendStartup (final Link link ) {
342361 final StartupCommand [] startup = _resource .initialize ();
343362 if (startup != null ) {
363+ final String msHostList = _shell .getPersistentProperty (null , "host" );
344364 final Command [] commands = new Command [startup .length ];
345365 for (int i = 0 ; i < startup .length ; i ++) {
346366 setupStartupCommand (startup [i ]);
367+ startup [i ].setMSHostList (msHostList );
347368 commands [i ] = startup [i ];
348369 }
349370 final Request request = new Request (_id != null ? _id : -1 , -1 , commands , false , false );
@@ -402,19 +423,23 @@ protected void reconnect(final Link link) {
402423 }
403424 }
404425
405- link .close ();
406- link .terminated ();
426+ if (link != null ) {
427+ link .close ();
428+ link .terminated ();
429+ }
407430
408431 setLink (null );
409432 cancelTasks ();
410433
411434 _resource .disconnected ();
412435
436+ final String lastConnectedHost = _shell .getConnectedHost ();
437+
413438 int inProgress = 0 ;
414439 do {
415440 _shell .getBackoffAlgorithm ().waitBeforeRetry ();
416441
417- s_logger .info ("Lost connection to the server . Dealing with the remaining commands..." );
442+ s_logger .info ("Lost connection to host: " + lastConnectedHost + " . Dealing with the remaining commands..." );
418443
419444 inProgress = _inProgress .get ();
420445 if (inProgress > 0 ) {
@@ -434,7 +459,7 @@ protected void reconnect(final Link link) {
434459 _shell .getBackoffAlgorithm ().waitBeforeRetry ();
435460 }
436461
437- final String host = _shell .getHost ();
462+ final String host = _shell .getNextHost ();
438463 do {
439464 _connection = new NioClient ("Agent" , host , _shell .getPort (), _shell .getWorkers (), this );
440465 s_logger .info ("Reconnecting to host:" + host );
@@ -452,7 +477,8 @@ protected void reconnect(final Link link) {
452477 }
453478 _shell .getBackoffAlgorithm ().waitBeforeRetry ();
454479 } while (!_connection .isStartup ());
455- s_logger .info ("Connected to the server" );
480+ _shell .updateConnectedHost ();
481+ s_logger .info ("Connected to the host: " + _shell .getConnectedHost ());
456482 }
457483
458484 public void processStartupAnswer (final Answer answer , final Response response , final Link link ) {
@@ -554,6 +580,8 @@ protected void processRequest(final Request request, final Link link) {
554580 answer = setupAgentCertificate ((SetupCertificateCommand ) cmd );
555581 } else if (cmd instanceof SetupDirectDownloadCertificate ) {
556582 answer = setupDirectDownloadCertificate ((SetupDirectDownloadCertificate ) cmd );
583+ } else if (cmd instanceof SetupMSListCommand ) {
584+ answer = setupManagementServerList ((SetupMSListCommand ) cmd );
557585 } else {
558586 if (cmd instanceof ReadyCommand ) {
559587 processReadyCommand (cmd );
@@ -708,6 +736,30 @@ private Answer setupAgentCertificate(final SetupCertificateCommand cmd) {
708736 return new SetupCertificateAnswer (true );
709737 }
710738
739+ private void processManagementServerList (final List <String > msList , final String lbAlgorithm , final Long lbCheckInterval ) {
740+ if (CollectionUtils .isNotEmpty (msList ) && !Strings .isNullOrEmpty (lbAlgorithm )) {
741+ try {
742+ final String newMSHosts = String .format ("%s%s%s" , StringUtils .toCSVList (msList ), IAgentShell .hostLbAlgorithmSeparator , lbAlgorithm );
743+ _shell .setPersistentProperty (null , "host" , newMSHosts );
744+ _shell .setHosts (newMSHosts );
745+ _shell .resetHostCounter ();
746+ s_logger .info ("Processed new management server list: " + newMSHosts );
747+ } catch (final Exception e ) {
748+ throw new CloudRuntimeException ("Could not persist received management servers list" , e );
749+ }
750+ }
751+ if ("shuffle" .equals (lbAlgorithm )) {
752+ scheduleHostLBCheckerTask (0 );
753+ } else {
754+ scheduleHostLBCheckerTask (_shell .getLbCheckerInterval (lbCheckInterval ));
755+ }
756+ }
757+
758+ private Answer setupManagementServerList (final SetupMSListCommand cmd ) {
759+ processManagementServerList (cmd .getMsList (), cmd .getLbAlgorithm (), cmd .getLbCheckInterval ());
760+ return new SetupMSListAnswer (true );
761+ }
762+
711763 public void processResponse (final Response response , final Link link ) {
712764 final Answer answer = response .getAnswer ();
713765 if (s_logger .isDebugEnabled ()) {
@@ -728,15 +780,16 @@ public void processResponse(final Response response, final Link link) {
728780 }
729781
730782 public void processReadyCommand (final Command cmd ) {
731-
732783 final ReadyCommand ready = (ReadyCommand )cmd ;
733784
734- s_logger .info ("Proccess agent ready command, agent id = " + ready .getHostId ());
785+ s_logger .info ("Processing agent ready command, agent id = " + ready .getHostId ());
735786 if (ready .getHostId () != null ) {
736787 setId (ready .getHostId ());
737788 }
738- s_logger .info ("Ready command is processed: agent id = " + getId ());
739789
790+ processManagementServerList (ready .getMsHostList (), ready .getLbAlgorithm (), ready .getLbCheckInterval ());
791+
792+ s_logger .info ("Ready command is processed for agent id = " + getId ());
740793 }
741794
742795 public void processOtherTask (final Task task ) {
@@ -1018,4 +1071,44 @@ public void doTask(final Task task) throws TaskExecutionException {
10181071 }
10191072 }
10201073 }
1074+
1075+ public class PreferredHostCheckerTask extends ManagedContextTimerTask {
1076+
1077+ @ Override
1078+ protected void runInContext () {
1079+ try {
1080+ final String [] msList = _shell .getHosts ();
1081+ if (msList == null || msList .length < 1 ) {
1082+ return ;
1083+ }
1084+ final String preferredHost = msList [0 ];
1085+ final String connectedHost = _shell .getConnectedHost ();
1086+ if (s_logger .isTraceEnabled ()) {
1087+ s_logger .trace ("Running preferred host checker task, connected host=" + connectedHost + ", preferred host=" + preferredHost );
1088+ }
1089+ if (preferredHost != null && !preferredHost .equals (connectedHost ) && _link != null ) {
1090+ boolean isHostUp = true ;
1091+ try (final Socket socket = new Socket ()) {
1092+ socket .connect (new InetSocketAddress (preferredHost , _shell .getPort ()), 5000 );
1093+ } catch (final IOException e ) {
1094+ isHostUp = false ;
1095+ if (s_logger .isTraceEnabled ()) {
1096+ s_logger .trace ("Host: " + preferredHost + " is not reachable" );
1097+ }
1098+ }
1099+ if (isHostUp && _link != null && _inProgress .get () == 0 ) {
1100+ if (s_logger .isDebugEnabled ()) {
1101+ s_logger .debug ("Preferred host " + preferredHost + " is found to be reachable, trying to reconnect" );
1102+ }
1103+ _shell .resetHostCounter ();
1104+ reconnect (_link );
1105+ }
1106+ }
1107+ } catch (Throwable t ) {
1108+ s_logger .error ("Error caught while attempting to connect to preferred host" , t );
1109+ }
1110+ }
1111+
1112+ }
1113+
10211114}
0 commit comments