Skip to content
This repository was archived by the owner on Jun 7, 2021. It is now read-only.

Commit c43bb2e

Browse files
author
aven
committed
TRAFODION-2940 In HA env, one node lose network, when recover, trafci can not use
1 parent 360427c commit c43bb2e

10 files changed

Lines changed: 300 additions & 58 deletions

File tree

dcs/bin/scripts/dcsunbind.sh

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,31 @@ function check_node {
5151
done
5252
}
5353

54+
# Unbind the external virtual (floating) ip address from this node only.
#
# Walks every interface listed by `ip link show`. When an interface carries
# $gv_float_external_ip under the label $gv_float_interface:$gv_port, the
# address is removed with `ip addr del`. The script exits with a failure
# status if that removal command fails.
function check_self_node {
    # Nothing to search for. This also prevents an empty grep pattern from
    # matching unrelated address lines.
    if [ -z "$gv_float_external_ip" ]; then
        return 0
    fi

    tempinterface="$gv_float_interface:$gv_port"

    for myinterface in $(/sbin/ip link show | awk -F': ' '/^[0-9]+:.*/ {print $2;}'); do
        ip_output=$(/sbin/ip addr show "$myinterface")

        # -F: treat the dots of the address literally instead of as regex wildcards
        myifport=$(echo "$ip_output" | grep -wF -- "$gv_float_external_ip")
        status=$?
        if [ $status -eq 0 ]; then
            # only act when the address is bound under the expected interface label
            if echo "$myifport" | grep -F -- "$tempinterface" > /dev/null; then
                unbindip=$(echo "$myifport" | awk '{print $2}')
                unbindlb=$(echo "$myifport" | awk '{print $NF}')
                echo "Virtual ip $gv_float_external_ip is in use on node $HOSTNAME bound to interface $myinterface($unbindlb) - unbinding..."
                sudo /sbin/ip addr del "$unbindip" dev "$myinterface"
                status=$?
                if [ $status -ne 0 ]; then
                    echo "Failed to unbind - status is $status"
                    exit -1
                fi
            fi # endif node+name match
        fi # endif looking for external ip
    done
}
78+
5479
function Check_VirtualIP_InUse_And_Unbind {
5580
echo "check all nodes to see if external virtual ip address is in use and unbind if necessary"
5681
mynode=""
@@ -64,12 +89,23 @@ function Check_VirtualIP_InUse_And_Unbind {
6489
fi
6590
}
6691

92+
# "self" mode entry point: delegates to check_self_node, which inspects only
# the local node's interfaces.
function Check_VirtualIP_InUse_And_Unbind_Self {
    check_self_node
}
95+
6796
#Main program
6897

6998
if [[ $ENABLE_HA == "false" ]]; then
7099
exit 0
71100
fi
72101

102+
# Scan the command-line arguments: "self" selects the self-only unbind path.
unbindSelf=false
for i in "$@"; do
    # The spaces around == are required. Without them [[ ]] sees one
    # non-empty word and the test succeeds for every argument.
    if [[ "$i" == "self" ]]; then
        unbindSelf=true
    fi
done
108+
73109
gv_float_internal_ip=`python $DCS_INSTALL_DIR/bin/scripts/parse_dcs_site.py|cut -d$'\n' -f2`
74110
gv_float_external_ip=`python $DCS_INSTALL_DIR/bin/scripts/parse_dcs_site.py|cut -d$'\n' -f2`
75111
gv_float_interface=`python $DCS_INSTALL_DIR/bin/scripts/parse_dcs_site.py|cut -d$'\n' -f1`
@@ -94,6 +130,10 @@ if [[ $AWS_CLOUD == "true" ]]; then
94130
echo "Detached interface :" $NETWORKINTERFACE
95131
fi
96132
else
97-
Check_VirtualIP_InUse_And_Unbind
133+
# $unbindSelf holds the string "true" or "false". Compare it explicitly:
# a bare [ "false" ] is a non-empty-string test and therefore succeeds.
if [[ "$unbindSelf" == "true" ]]; then
    Check_VirtualIP_InUse_And_Unbind_Self
else
    Check_VirtualIP_InUse_And_Unbind
fi
98138
fi
99139
exit 0

dcs/src/main/java/org/trafodion/dcs/Constants.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,12 @@ public final class Constants {
559559
/** Default value for DcsMaster floating IP command */
560560
public static final String DEFAULT_DCS_MASTER_FLOATING_IP_COMMAND = "cd ${dcs.home.dir};bin/scripts/dcsbind.sh -i -a -p";
561561

562+
/** Configuration key for the DcsMaster floating IP unbind command */
563+
public static final String DCS_MASTER_FLOATING_IP_COMMAND_UNBIND = "dcs.master.floating.ip.command.unbind";
564+
565+
/** Default value for the DcsMaster floating IP unbind command */
566+
public static final String DEFAULT_DCS_MASTER_FLOATING_IP_COMMAND_UNBIND = "cd ${dcs.home.dir};bin/scripts/dcsunbind.sh self";
567+
562568
/** DcsMaster Floating IP external interface */
563569
public static final String DCS_MASTER_FLOATING_IP_EXTERNAL_INTERFACE = "dcs.master.floating.ip.external.interface";
564570

dcs/src/main/java/org/trafodion/dcs/master/DcsMaster.java

Lines changed: 114 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,16 @@ Licensed to the Apache Software Foundation (ASF) under one
2323
package org.trafodion.dcs.master;
2424

2525
import java.io.IOException;
26-
import java.io.InputStream;
27-
import java.net.InetAddress;
28-
import java.net.InetSocketAddress;
29-
import java.net.ServerSocket;
30-
import java.net.NetworkInterface;
31-
import java.nio.charset.Charset;
32-
import java.util.Enumeration;
33-
import java.util.*;
26+
import java.util.concurrent.Callable;
27+
import java.util.concurrent.CompletionService;
3428
import java.util.concurrent.CountDownLatch;
35-
import java.util.concurrent.Executors;
29+
import java.util.concurrent.ExecutionException;
30+
import java.util.concurrent.ExecutorCompletionService;
3631
import java.util.concurrent.ExecutorService;
32+
import java.util.concurrent.Executors;
3733
import java.util.concurrent.Future;
38-
import java.util.concurrent.ExecutionException;
34+
import java.util.concurrent.TimeUnit;
3935

40-
import org.apache.commons.io.IOUtils;
4136
import org.apache.commons.cli.CommandLine;
4237
import org.apache.commons.cli.GnuParser;
4338
import org.apache.commons.cli.Options;
@@ -46,23 +41,22 @@ Licensed to the Apache Software Foundation (ASF) under one
4641
import org.apache.commons.logging.LogFactory;
4742
import org.apache.hadoop.conf.Configuration;
4843
import org.apache.zookeeper.CreateMode;
44+
import org.apache.zookeeper.KeeperException;
4945
import org.apache.zookeeper.ZooDefs;
5046
import org.apache.zookeeper.data.Stat;
51-
import org.apache.zookeeper.KeeperException;
52-
import org.apache.zookeeper.ZooKeeper.States;
53-
54-
import org.apache.hadoop.util.StringUtils;
55-
5647
import org.trafodion.dcs.Constants;
48+
import org.trafodion.dcs.master.listener.ListenerService;
49+
import org.trafodion.dcs.master.listener.ListenerWorker;
5750
import org.trafodion.dcs.util.DcsConfiguration;
5851
import org.trafodion.dcs.util.DcsNetworkConfiguration;
5952
import org.trafodion.dcs.util.InfoServer;
53+
import org.trafodion.dcs.util.RetryCounter;
54+
import org.trafodion.dcs.util.RetryCounterFactory;
6055
import org.trafodion.dcs.util.VersionInfo;
61-
import org.trafodion.dcs.zookeeper.ZkClient;
6256
import org.trafodion.dcs.zookeeper.ZKConfig;
63-
import org.trafodion.dcs.master.listener.ListenerService;
57+
import org.trafodion.dcs.zookeeper.ZkClient;
6458

65-
public class DcsMaster implements Runnable {
59+
public class DcsMaster implements Callable<Integer> {
6660
private static final Log LOG = LogFactory.getLog(DcsMaster.class);
6761
private Thread thrd;
6862
private ZkClient zkc = null;
@@ -111,11 +105,50 @@ public DcsMaster(String[] args) {
111105
trafodionHome = System.getProperty(Constants.DCS_TRAFODION_HOME);
112106
jvmShutdownHook = new JVMShutdownHook();
113107
Runtime.getRuntime().addShutdownHook(jvmShutdownHook);
114-
thrd = new Thread(this);
115-
thrd.start();
108+
109+
// 35000 * 15mins ~= 1 years
110+
RetryCounter retryCounter = RetryCounterFactory.create(35000, 15, TimeUnit.MINUTES);
111+
ExecutorService executorService = Executors.newFixedThreadPool(1);
112+
CompletionService<Integer> completionService = new ExecutorCompletionService<Integer>(executorService);
113+
114+
// Supervise the master: run call() on the executor, wait for its status,
// then either exit the JVM or restart it after a back-off.
while (true) {
    completionService.submit(this);
    try {
        Future<Integer> f = completionService.take();
        if (f == null) {
            continue;
        }
        Integer status = f.get();
        if (status <= 0) {
            // unrecoverable (or unknown) failure reported by call()
            System.exit(status);
        } else if (status == 1) {
            if (!retryCounter.shouldRetry()) {
                // retries exhausted
                System.exit(-2);
            }
            retryCounter.sleepUntilNextRetry();
            retryCounter.useRetry();
            // reset lock
            isLeader = new CountDownLatch(1);
            // No break here: the loop must come around again so call() is
            // resubmitted; leaving the loop would mean the master never restarts.
        } else {
            //TODO for other unknown status
        }
    } catch (InterruptedException | ExecutionException e) {
        LOG.error(e.getMessage(), e);
    }
}
141+
116142
}
117143

118-
public void run() {
144+
// Return value less than 0: unrecoverable error, the process exits.
145+
// -1 configuration error
146+
// -2 retries exhausted
147+
// Return value greater than 0: recoverable error.
148+
// 1 network error; retry until the network recovers.
149+
// Return value equal to 0: unknown exception, exit immediately.
150+
// Change the value to something other than 0 once the real cause of the exception is confirmed.
151+
public Integer call() {
119152
VersionInfo.logVersion();
120153

121154
Options opt = new Options();
@@ -129,19 +162,19 @@ public void run() {
129162
instance = "1";
130163
} catch (NullPointerException e) {
131164
LOG.error("No args found: ", e);
132-
System.exit(1);
165+
return -1;
133166
} catch (ParseException e) {
134167
LOG.error("Could not parse: ", e);
135-
System.exit(1);
168+
return -1;
136169
}
137170

138171
try {
139172
zkc = new ZkClient();
140173
zkc.connect();
141174
LOG.info("Connected to ZooKeeper");
142-
} catch (Exception e) {
143-
LOG.error(e);
144-
System.exit(1);
175+
} catch (IOException | InterruptedException e) {
176+
LOG.error(e.getMessage(), e);
177+
return 1;
145178
}
146179

147180
try {
@@ -202,9 +235,10 @@ public void run() {
202235
}
203236
} catch (KeeperException.NodeExistsException e) {
204237
// do nothing...some other server has created znodes
238+
LOG.warn(e.getMessage(), e);
205239
} catch (Exception e) {
206-
LOG.error(e);
207-
System.exit(0);
240+
LOG.error(e.getMessage(), e);
241+
return 0;
208242
}
209243

210244
metrics = new Metrics();
@@ -213,10 +247,10 @@ public void run() {
213247
try {
214248
netConf = new DcsNetworkConfiguration(conf);
215249
serverName = netConf.getHostName();
216-
if (serverName == null) {
250+
if (serverName == null) {
217251
LOG.error("DNS Interface [" + conf.get(Constants.DCS_DNS_INTERFACE, Constants.DEFAULT_DCS_DNS_INTERFACE)
218-
+ "] configured in dcs.site.xml is not found!");
219-
System.exit(1);
252+
+ "] configured in dcs.site.xml is not found!");
253+
return -1;
220254
}
221255

222256
// Wait to become the leader of all DcsMasters
@@ -229,6 +263,11 @@ public void run() {
229263
+ ":" + startTime;
230264
zkc.create(path, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE,
231265
CreateMode.EPHEMERAL);
266+
// Register this path as the check path for the session-expired case:
267+
// if the session expires, this marker is compared with the existing znode,
268+
// and a mismatch means that a backup master has taken over the master role.
269+
zkc.setCheckPath(path);
270+
232271
LOG.info("Created znode [" + path + "]");
233272

234273
int requestTimeout = conf.getInt(
@@ -262,12 +301,50 @@ public void run() {
262301
future.get();// block
263302

264303
} catch (Exception e) {
265-
LOG.error(e);
266-
e.printStackTrace();
267-
if (pool != null)
268-
pool.shutdown();
269-
System.exit(0);
304+
LOG.error(e.getMessage(), e);
305+
try {
306+
FloatingIp floatingIp = FloatingIp.getInstance(this);
307+
floatingIp.unbindScript();
308+
} catch (Exception e1) {
309+
if (LOG.isErrorEnabled()) {
310+
LOG.error("Error creating class FloatingIp [" + e1.getMessage() + "]", e1);
311+
}
312+
}
313+
314+
if (pool != null) {
315+
try {
316+
pool.shutdown();
317+
LOG.info("Interrupt listenerService.");
318+
} catch (Exception e2) {
319+
LOG.error("Error while shutdown ServerManager thread [" + e2.getMessage() + "]", e2);
320+
}
321+
}
322+
323+
if (ls != null) {
324+
try {
325+
ListenerWorker lw = ls.getWorker();
326+
if (lw != null) {
327+
lw.interrupt();
328+
LOG.info("Interrupt listenerWorker.");
329+
}
330+
ls.interrupt();
331+
LOG.info("Interrupt listenerService.");
332+
} catch (Exception e2) {
333+
LOG.error("Error while shutdown ListenerService thread [" + e2.getMessage() + "]", e2);
334+
}
335+
}
336+
if (infoServer != null) {
337+
try {
338+
infoServer.stop();
339+
LOG.info("Stop infoServer.");
340+
} catch (Exception e2) {
341+
LOG.error("Error while shutdown InfoServer thread [" + e2.getMessage(), e2);
342+
}
343+
}
344+
return 1;
345+
270346
}
347+
return 0;
271348
}
272349

273350
public String getServerName() {

dcs/src/main/java/org/trafodion/dcs/master/FloatingIp.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,43 @@ public boolean isEnabled() {
6161
return isEnabled;
6262
}
6363

64+
public synchronized int unbindScript() throws Exception {
65+
if (isEnabled)
66+
LOG.info("Floating IP is enabled");
67+
else {
68+
LOG.info("Floating IP is disabled");
69+
return 0;
70+
}
71+
72+
ScriptContext scriptContext = new ScriptContext();
73+
scriptContext.setScriptName(Constants.SYS_SHELL_SCRIPT_NAME);
74+
scriptContext.setStripStdOut(false);
75+
scriptContext.setStripStdErr(false);
76+
77+
String command = master.getConfiguration().get(Constants.DCS_MASTER_FLOATING_IP_COMMAND_UNBIND,
78+
Constants.DEFAULT_DCS_MASTER_FLOATING_IP_COMMAND_UNBIND);
79+
80+
scriptContext.setCommand(command);
81+
LOG.info("Unbind Floating IP [" + scriptContext.getCommand() + "]");
82+
ScriptManager.getInstance().runScript(scriptContext);// Blocking call
83+
84+
StringBuilder sb = new StringBuilder();
85+
sb.append("exit code [" + scriptContext.getExitCode() + "]");
86+
if (!scriptContext.getStdOut().toString().isEmpty())
87+
sb.append(", stdout [" + scriptContext.getStdOut().toString() + "]");
88+
if (!scriptContext.getStdErr().toString().isEmpty())
89+
sb.append(", stderr [" + scriptContext.getStdErr().toString() + "]");
90+
if (LOG.isErrorEnabled())
91+
LOG.error(sb.toString());
92+
93+
if (scriptContext.getExitCode() == 0)
94+
LOG.info("Unbind Floating IP successful, exit code [" + 0 + "]");
95+
else
96+
LOG.error("Unbind Floating IP failed, exit code [" + scriptContext.getExitCode() + "]");
97+
98+
return scriptContext.getExitCode();
99+
}
100+
64101
public synchronized int runScript() throws Exception {
65102
if (isEnabled)
66103
LOG.info("Floating IP is enabled");

dcs/src/main/java/org/trafodion/dcs/master/ServerManager.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,16 +328,19 @@ public Boolean call() throws Exception {
328328
}
329329
}
330330

331+
if (!zkc.isSessionRecoverSuccessful()) {
332+
throw new Exception("error while recover zkclient session. lose zookeeper connection. restart DCS Master.");
333+
}
334+
331335
try {
332336
Thread.sleep(timeoutMillis);
333337
} catch (InterruptedException e) {
334338
}
335339
}
336340

337341
} catch (Exception e) {
338-
e.printStackTrace();
339342
if (LOG.isErrorEnabled())
340-
LOG.error(e);
343+
LOG.error(e.getMessage(), e);
341344
pool.shutdown();
342345
throw e;
343346
}

0 commit comments

Comments
 (0)