Skip to content

Commit

Permalink
Fix potential deadlock with curator ZK store (slackhq#1010)
Browse files (browse the repository at this point in the history)
Co-authored-by: Bryan Burkholder <[email protected]>
  • Loading branch information
bryanlb and bryanlb authored Jul 30, 2024
1 parent f947fe6 commit 61c8a08
Showing 1 changed file with 25 additions and 2 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,7 @@

import static com.slack.astra.server.AstraConfig.DEFAULT_ZK_TIMEOUT_SECS;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.slack.astra.util.RuntimeHalterImpl;
import java.io.Closeable;
import java.util.List;
Expand All @@ -11,6 +12,8 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.curator.x.async.AsyncCuratorFramework;
Expand Down Expand Up @@ -44,6 +47,9 @@ public class AstraMetadataStore<T extends AstraMetadata> implements Closeable {
private final Map<AstraMetadataStoreChangeListener<T>, ModeledCacheListener<T>> listenerMap =
new ConcurrentHashMap<>();

private final ExecutorService cacheInitializedService;
private final ModeledCacheListener<T> initializedListener = getCacheInitializedListener();

public AstraMetadataStore(
AsyncCuratorFramework curator,
CreateMode createMode,
Expand All @@ -64,11 +70,15 @@ public AstraMetadataStore(
modeledClient = ModeledFramework.wrap(curator, modelSpec);

if (shouldCache) {
cacheInitializedService =
Executors.newSingleThreadExecutor(
new ThreadFactoryBuilder().setNameFormat("cache-initialized-service-%d").build());
cachedModeledFramework = modeledClient.cached();
cachedModeledFramework.listenable().addListener(getCacheInitializedListener());
cachedModeledFramework.listenable().addListener(initializedListener, cacheInitializedService);
cachedModeledFramework.start();
} else {
cachedModeledFramework = null;
cacheInitializedService = null;
}
}

Expand Down Expand Up @@ -204,7 +214,12 @@ public void removeListener(AstraMetadataStoreChangeListener<T> watcher) {

private void awaitCacheInitialized() {
try {
cacheInitialized.await();
if (!cacheInitialized.await(30, TimeUnit.SECONDS)) {
// in the event we deadlock, go ahead and time this out at 30s and restart the pod
new RuntimeHalterImpl()
.handleFatal(
new TimeoutException("Timed out waiting for Zookeeper cache to initialize"));
}
} catch (InterruptedException e) {
new RuntimeHalterImpl().handleFatal(e);
}
Expand All @@ -221,6 +236,14 @@ public void accept(Type type, ZPath path, Stat stat, T model) {
public void initialized() {
ModeledCacheListener.super.initialized();
cacheInitialized.countDown();

// after it's initialized, we no longer need the listener or executor
if (cachedModeledFramework != null) {
cachedModeledFramework.listenable().removeListener(initializedListener);
}
if (cacheInitializedService != null) {
cacheInitializedService.shutdown();
}
}
};
}
Expand Down

0 comments on commit 61c8a08

Please sign in to comment.