-
Notifications
You must be signed in to change notification settings - Fork 15.3k
KAFKA-10317: Global thread should honor shutdown signal during bootstrapping #22417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from 4 commits
7c26aca
1902941
627150c
bf57fef
567b610
56abb15
7bddbf1
09935c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ | |
| import org.apache.kafka.common.PartitionInfo; | ||
| import org.apache.kafka.common.TopicPartition; | ||
| import org.apache.kafka.common.errors.TimeoutException; | ||
| import org.apache.kafka.common.errors.WakeupException; | ||
| import org.apache.kafka.common.metrics.Sensor; | ||
| import org.apache.kafka.common.serialization.ByteArrayDeserializer; | ||
| import org.apache.kafka.common.utils.Time; | ||
|
|
@@ -64,6 +65,7 @@ | |
| import java.util.Objects; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import java.util.function.BooleanSupplier; | ||
| import java.util.function.Supplier; | ||
|
|
||
| import static org.apache.kafka.streams.StreamsConfig.PROCESSING_EXCEPTION_HANDLER_CLASS_CONFIG; | ||
|
|
@@ -123,14 +125,16 @@ private static class StateStoreMetadata { | |
| private DeserializationExceptionHandler deserializationExceptionHandler; | ||
| private ProcessingExceptionHandler processingExceptionHandler; | ||
| private Sensor droppedRecordsSensor; | ||
| private BooleanSupplier inErrorStateSupplier; | ||
|
|
||
| public GlobalStateManagerImpl(final LogContext logContext, | ||
| final Time time, | ||
| final ProcessorTopology topology, | ||
| final Consumer<byte[], byte[]> globalConsumer, | ||
| final StateDirectory stateDirectory, | ||
| final StateRestoreListener stateRestoreListener, | ||
| final StreamsConfig config) { | ||
| final StreamsConfig config, | ||
| final BooleanSupplier inErrorStateSupplier) { | ||
| this.time = time; | ||
| this.topology = topology; | ||
| this.stateDirectory = stateDirectory; | ||
|
|
@@ -147,6 +151,7 @@ public GlobalStateManagerImpl(final LogContext logContext, | |
| logPrefix = logContext.logPrefix(); | ||
| this.globalConsumer = globalConsumer; | ||
| this.stateRestoreListener = stateRestoreListener; | ||
| this.inErrorStateSupplier = inErrorStateSupplier; | ||
|
|
||
| final Map<String, Object> consumerProps = config.getGlobalConsumerConfigs("dummy"); | ||
| // need to add mandatory configs; otherwise `QuietConsumerConfig` throws | ||
|
|
@@ -209,6 +214,10 @@ public Set<String> initialize() { | |
| LegacyCheckpointingStateStore.migrateLegacyOffsets(logPrefix, stateDirectory, null, wrappedStores); | ||
|
|
||
| for (final StateStoreMetadata metadata : storeMetadata.values()) { | ||
| if (inErrorStateSupplier.getAsBoolean()) { | ||
| log.info("Global store bootstrap interrupted by shutdown before starting {}", metadata.stateStore.name()); | ||
| break; | ||
| } | ||
| // load the committed offsets from the store | ||
| final StateStore store = metadata.stateStore; | ||
| if (store.persistent()) { | ||
|
|
@@ -348,7 +357,21 @@ private void reprocessState(final StateStoreMetadata storeMetadata) { | |
| // TODO with https://issues.apache.org/jira/browse/KAFKA-10315 we can just call | ||
| // `poll(pollMS)` without adding the request timeout and do a more precise | ||
| // timeout handling | ||
| final ConsumerRecords<byte[], byte[]> records = globalConsumer.poll(pollMsPlusRequestTimeout); | ||
| if (inErrorStateSupplier.getAsBoolean()) { | ||
| logBootstrapInterrupted(storeMetadata); | ||
| return; | ||
| } | ||
|
|
||
| final ConsumerRecords<byte[], byte[]> records; | ||
| try { | ||
| records = globalConsumer.poll(pollMsPlusRequestTimeout); | ||
| } catch (final WakeupException e) { | ||
| if (inErrorStateSupplier.getAsBoolean()) { | ||
| logBootstrapInterrupted(storeMetadata); | ||
| return; | ||
| } | ||
| throw e; | ||
| } | ||
| if (records.isEmpty()) { | ||
| currentDeadline = maybeUpdateDeadlineOrThrow(currentDeadline); | ||
| } else { | ||
|
|
@@ -493,7 +516,20 @@ private void restoreState(final StateStoreMetadata storeMetadata) { | |
| // TODO with https://issues.apache.org/jira/browse/KAFKA-10315 we can just call | ||
| // `poll(pollMS)` without adding the request timeout and do a more precise | ||
| // timeout handling | ||
| final ConsumerRecords<byte[], byte[]> records = globalConsumer.poll(pollMsPlusRequestTimeout); | ||
| if (inErrorStateSupplier.getAsBoolean()) { | ||
| logBootstrapInterrupted(storeMetadata); | ||
| return; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we make the shutdown-interrupted bootstrap path explicit instead of returning normally from Currently, when Since What do you think?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @chickenchickenlove , thanks for the review!
|
||
| } | ||
| final ConsumerRecords<byte[], byte[]> records; | ||
| try { | ||
| records = globalConsumer.poll(pollMsPlusRequestTimeout); | ||
| } catch (final WakeupException e) { | ||
| if (inErrorStateSupplier.getAsBoolean()) { | ||
| logBootstrapInterrupted(storeMetadata); | ||
| return; | ||
| } | ||
| throw e; | ||
| } | ||
| if (records.isEmpty()) { | ||
| currentDeadline = maybeUpdateDeadlineOrThrow(currentDeadline); | ||
| } else { | ||
|
|
@@ -518,6 +554,10 @@ private void restoreState(final StateStoreMetadata storeMetadata) { | |
| } | ||
| } | ||
|
|
||
| private void logBootstrapInterrupted(final StateStoreMetadata storeMetadata) { | ||
| log.info("Bootstrap interrupted by shutdown for {}", storeMetadata.stateStore.name()); | ||
| } | ||
|
|
||
| private long getGlobalConsumerOffset(final TopicPartition topicPartition) { | ||
| return retryUntilSuccessOrThrowOnTaskTimeout( | ||
| () -> globalConsumer.position(topicPartition), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| import org.apache.kafka.common.TopicPartition; | ||
| import org.apache.kafka.common.Uuid; | ||
| import org.apache.kafka.common.errors.TimeoutException; | ||
| import org.apache.kafka.common.errors.WakeupException; | ||
| import org.apache.kafka.common.internals.KafkaFutureImpl; | ||
| import org.apache.kafka.common.utils.Time; | ||
| import org.apache.kafka.common.utils.internals.LogContext; | ||
|
|
@@ -299,7 +300,13 @@ public void run() { | |
| if (size != -1L) { | ||
| cache.resize(size); | ||
| } | ||
| stateConsumer.pollAndUpdate(); | ||
| try { | ||
| stateConsumer.pollAndUpdate(); | ||
| } catch (final WakeupException e) { | ||
| if (!inErrorState()) { | ||
| throw e; | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is not part of bootstrapping, is it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! The steady-state catch wasn't really about bootstrap. I removed it, and also tightened |
||
|
|
||
| if (fetchDeadlineClientInstanceId != -1) { | ||
| if (fetchDeadlineClientInstanceId >= time.milliseconds()) { | ||
|
|
@@ -382,7 +389,8 @@ private StateConsumer initialize() { | |
| globalConsumer, | ||
| stateDirectory, | ||
| stateRestoreListener, | ||
| config | ||
| config, | ||
| this::inErrorState | ||
| ); | ||
|
|
||
| final GlobalProcessorContextImpl globalProcessorContext = new GlobalProcessorContextImpl( | ||
|
|
@@ -429,8 +437,21 @@ private StateConsumer initialize() { | |
| ); | ||
| } | ||
|
|
||
| if (inErrorState()) { | ||
| closeStateConsumer(stateConsumer, false); | ||
| return null; | ||
| } | ||
|
Comment on lines
+433
to
+436
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for which scenario we need this check?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The previous […] This follow-up check covers the situation where shutdown has already been requested: it routes to cleanup, which causes the run() loop to take the early-exit path.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right, I see it now. thanks for clarification |
||
|
|
||
| setState(RUNNING); | ||
| return stateConsumer; | ||
| } catch (final WakeupException e) { | ||
| closeStateConsumer(stateConsumer, false); | ||
| if (inErrorState()) { | ||
| log.info("Global thread initialization interrupted by shutdown"); | ||
| } else { | ||
| startupException = new StreamsException( | ||
| "Unexpected wakeup during initialization of GlobalStreamThread", e); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this part should be enough:
```java
if (inErrorState()) {
    closeStateConsumer(stateConsumer, false);
    return null;
}
```
Do we need to catch […]
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for flagging this! For the Right now after applying the suggestion of @chickenchickenlove , this catch is no longer needed and has been removed. All |
||
| } catch (final StreamsException fatalException) { | ||
| closeStateConsumer(stateConsumer, false); | ||
| startupException = fatalException; | ||
|
|
@@ -477,6 +498,7 @@ public void shutdown() { | |
| // if already shutting down or dead | ||
| setState(PENDING_SHUTDOWN); | ||
| initializationLatch.countDown(); | ||
| globalConsumer.wakeup(); | ||
| } | ||
|
|
||
| public Map<MetricName, Metric> consumerMetrics() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like a lot of code duplication. Can we move it to a dedicated method, or keep the check at a higher level?
AFAIU, once restoration of the current store completes and we move on to the next one, we'll interrupt it anyway. It's a trade-off to avoid checking the same condition n times.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the advice!
I have removed the check inside `restoreState` and `reprocessState`, and instead just have an outer supplier check in the per-store loop in `GlobalStateManagerImpl.initialize()`, returning empty directly instead of continuing to the next store.