解决这类问题最高效的办法就是看看其他人是否已经解决过了;如果没有,那就只能依赖监控数据、依赖能收集到的各类错误信息进行推断,或者搭建一个可以debug源码的环境。不过,我们很快发现其他人也遇到过类似的问题:问题的根源在于zk客户端在重连过程中一次性提交的watcher数量太大,使得这个packet的大小超过了server端允许读取的最大设定值。
src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java
/**
 * Interprets the four bytes held in {@code lenBuffer}, which are either a
 * request length or a four letter word.
 *
 * @param k selection key (not referenced by the body; {@code sk} from the
 *          enclosing scope is what gets passed to {@code checkFourLetterWord})
 * @return true if the bytes were taken as a length and {@code incomingBuffer}
 *         was allocated with that size; false when {@code initialized} is not
 *         yet set and {@code checkFourLetterWord} returns true
 * @throws IOException if the length is negative or larger than
 *         {@code BinaryInputArchive.maxBuffer}, or if the server is not running
 */
private boolean readLength(SelectionKey k) throws IOException {
    // The same four bytes serve as either a payload size or a command word.
    final int size = lenBuffer.getInt();

    final boolean handledAsCommand = !initialized && checkFourLetterWord(sk, size);
    if (handledAsCommand) {
        return false;
    }

    // Reject sizes outside [0, maxBuffer] before any allocation happens.
    final boolean withinLimit = size >= 0 && size <= BinaryInputArchive.maxBuffer;
    if (!withinLimit) {
        throw new IOException("Len error " + size);
    }

    if (isZKServerRunning()) {
        incomingBuffer = ByteBuffer.allocate(size);
        return true;
    }
    throw new IOException("ZooKeeperServer not running");
}
// Step 1: bring the zk server up and drop its database.
startServerInstance();
dropZKDB();

// Step 2: build and start a Curator client against that server.
String connectString = host + ":" + port;
CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder()
        .connectString(connectString)
        .retryPolicy(new ExponentialBackoffRetry(200, 3))
        .connectionTimeoutMs(CONNECTION_TIMEOUT)
        .sessionTimeoutMs(60000)
        .namespace("zk-demo1");
client = builder.build();
client.start();

// Step 3: create 1000 nodes, each with a watcher set on it.
prepareData();

// Step 4: shut the zk server down and wait for the client to notice the disconnect.
shutdownServerInstance();
awaitDisconnect(CONNECTION_TIMEOUT);

// Step 5: start the zk server again.
startServerInstance();
# Send the "cons" four letter word to the ZooKeeper server on port 2181, keep the lines that
# mention the client IP, take the second field when splitting on parentheses, and print
# its 4th comma-separated field (presumably the session id -- confirm against the cons output).
echo cons | nc {zookeeper1 ip} 2181 | grep {clientIP} |awk -F '\\(|\\)' '{print $2}' | awk -F ',' '{print $4}'
# Send the "wchc" four letter word and print the output from the first line that starts with
# $arg through the next line that starts with 0x (presumably the watched paths of the
# session whose id is in $arg -- confirm).
echo wchc | nc 10.172.1.200 2181 | sed -n "/^$arg/,/^0x/p";
0xb624774d87a00c2
/disconf/mooc_0.0.1_online/file/concurrencyControl.xml
0xb624774d87a00c4
/imooc-online/locks_v3/mocUserLessonUnitLearn_4096/1123/_c_8708388e-0a40-4ea9-82aa-9c5cfd41002d-lock-0001329502
/imooc-online/locks_v3/trackMobileLogonLocker_1024/651/_c_c1c6756b-7f76-4e73-9bc3-f9697ed57d5d-lock-0000163994
/imooc-online/locks_v3/mocUserLessonUnitLearn_4096/753/_c_03c07a2a-71ae-4deb-b1b8-9936e133b89f-lock-0001330917
/imooc-online/locks_v3/trackMobileLogonLocker_1024/17/_c_ce3628b2-5438-43f1-aa43-fe9a570150bf-lock-0000164342
/imooc-online/locks_v3/mocUserLessonUnitLearn_4096/3290/_c_70a8efbc-932a-4484-ba28-fc3abc885dfd-lock-0001320518
/imooc-online/locks_v3/mocUserLessonUnitLearn_4096/3075/_c_a9bbd2e1-8535-4bee-b6eb-f976577224b5-lock-0001289817
/imooc-online/locks_v3/mocUserLessonUnitLearn_4096/2094/_c_d4848800-ffc6-4b3a-a3bd-dff9e6adb812-lock-0001306049
/imooc-online/locks_v3/mocUserLessonUnitLearn_4096/2176/_c_cc5234a1-45cf-4290-bea5-1e4589bdeb4f-lock-0001327665
....
0x9624774d48800e0
/dubbo/com.netease.mooc.activity.remote.service.RemoteMocActivityTeamService/configurators
/dubbo/com.netease.mooc.course.remote.service.RemoteMocCourseCategoryService/routers
/dubbo/com.netease.mooc.course.remote.service.RemoteMocCourseService/routers
/dubbo/com.netease.mooc.activity.remote.service.RemoteMocActivityService/routers
/dubbo/com.netease.mooc.activity.remote.service.RemoteMocActivityCounterService/providers
/dubbo/com.netease.edu.biz.message.share.service.Message2AdapterService/configurators
/dubbo/com.netease.mooc.course.manager.MocCourseManager/routers
/dubbo/com.netease.mooc.activity.remote.service.RemoteMocActivityService/providers
/dubbo/com.netease.mooc.attachment.manager.MocTestAnswerAttachmentManager/routers
/dubbo/com.netease.mooc.remote.service.Remote3LevelExtraAddressService/providers
...
// Start 100 contenders; each one owns a separate InterProcessMutex on the same "/locks" path.
int threadCount = 100;
ExecutorService es = Executors.newFixedThreadPool(threadCount);
int next = 0;
while (next < threadCount) {
    final InterProcessMutex mutex = new InterProcessMutex(this.client, "/locks");
    final int id = next;
    es.submit(new Runnable() {
        @Override
        public void run() {
            boolean held = false;
            try {
                // Each contender waits at most 20 seconds for the lock.
                held = mutex.acquire(20, TimeUnit.SECONDS);
                if (!held) {
                    LOG.info("acquired failed!, number:" + id);
                    return;
                }
                LOG.info("i am thread No.:" + id);
                // Keep the lock for 30 seconds, longer than the 20 second wait of the others.
                TimeUnit.SECONDS.sleep(30);
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                // Only a contender that actually got the lock releases it.
                if (held) {
                    try {
                        LOG.info("i am released, number:" + id);
                        mutex.release();
                    } catch (Exception ex) {
                        ex.printStackTrace();
                    }
                }
            }
        }
    });
    next++;
}
es.shutdown();
// Poll once per second until every submitted task has finished.
for (;;) {
    if (es.awaitTermination(1, TimeUnit.SECONDS)) {
        break;
    }
}
// Dump the server's "wchc" output so watchers that were not removed can be inspected.
LOG.info("find the list of watchers not removed properly:\n" + FourLetterWordMain.send4LetterWord(host, port, "wchc"));
--- a/src/java/main/org/apache/zookeeper/server/DataTree.java
+++ b/src/java/main/org/apache/zookeeper/server/DataTree.java
@@ -572,15 +572,17 @@ public String getMaxPrefixWithQuota(String path) {
public Stat statNode(String path, Watcher watcher)
throws KeeperException.NoNodeException {
- Stat stat = new Stat();
- DataNode n = nodes.get(path);
- if (watcher != null) {
- dataWatches.addWatch(path, watcher);
- }
- if (n == null) {
- throw new KeeperException.NoNodeException();
- }
+
+ Stat stat = new Stat();
+ DataNode n = nodes.get(path);
+ if (n == null) {
+ throw new KeeperException.NoNodeException();
+ }
+
+ if (watcher != null) {
+ dataWatches.addWatch(path, watcher);
+ }
synchronized (n) {
n.copyStat(stat);
return stat;
}
重新打包测试,测试后问题得到解决。