-
-
Notifications
You must be signed in to change notification settings - Fork 1
[DNM] Add Arroyo batch timeout logic #461
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
trevor-e
wants to merge
3
commits into
main
Choose a base branch
from
telkins/process-timeout
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| from arroyo import Message, Topic, configure_metrics | ||
| from arroyo.backends.kafka import KafkaConsumer as ArroyoKafkaConsumer | ||
| from arroyo.backends.kafka import KafkaPayload | ||
| from arroyo.dlq import InvalidMessage | ||
| from arroyo.processing.processor import StreamProcessor | ||
| from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory | ||
| from arroyo.processing.strategies.commit import CommitOffsets | ||
|
|
@@ -62,19 +63,92 @@ def __init__( | |
| self, | ||
| function: Callable[[Message[TStrategyPayload]], Any], | ||
| next_step: ProcessingStrategy[FilteredPayload | Any], | ||
| max_batch_size: int, | ||
| max_batch_time: float, | ||
| pool: MultiprocessingPool, | ||
| input_block_size: int | None = None, | ||
| output_block_size: int | None = None, | ||
| batch_timeout: float | None = None, | ||
| ) -> None: | ||
| super().__init__(function, next_step, max_batch_size, max_batch_time, pool, input_block_size, output_block_size) | ||
| super().__init__(function, next_step, 1, 1, pool, input_block_size, output_block_size) | ||
|
|
||
| self._batch_timeout = batch_timeout | ||
| self._batch_submit_times: dict[int, float] = {} # Maps batch id to submission timestamp | ||
|
|
||
| # Override SIGCHLD handler - child exits are expected with maxtasksperchild=1 | ||
| signal.signal( | ||
| signal.SIGCHLD, | ||
| lambda signum, frame: logger.debug(f"Worker process exited normally (SIGCHLD {signum})"), | ||
| ) | ||
|
|
||
| def poll(self) -> None: | ||
| if self._batch_timeout is not None: | ||
| try: | ||
| self._check_batch_timeouts() | ||
| except Exception as e: | ||
| logger.error(f"Error checking batch timeouts: {e}", exc_info=True) | ||
|
|
||
| super().poll() | ||
|
|
||
| def _check_batch_timeouts(self) -> None: | ||
| """ | ||
| Check if any in-flight batches have exceeded timeout and terminate those workers. | ||
|
|
||
| This accesses parent class private members via name mangling. If parent class | ||
| changes these member names, this will break. | ||
| """ | ||
|
|
||
| try: | ||
| processes = self._RunTaskWithMultiprocessing__processes | ||
| pool = self._RunTaskWithMultiprocessing__pool | ||
| invalid_messages = self._RunTaskWithMultiprocessing__invalid_messages | ||
| except AttributeError: | ||
| logger.exception("Failed to access parent class private members - please check if the Arroyo API changed:") | ||
| return | ||
|
|
||
| # Track batch times to know which batch is timed out | ||
| # Make sure to clean up finished/timed out batches so this doesn't grow unbounded | ||
| if processes: | ||
| current_batch_ids = {id(batch) for batch in processes} | ||
| completed_batch_ids = [bid for bid in self._batch_submit_times if bid not in current_batch_ids] | ||
| for batch_id in completed_batch_ids: | ||
| del self._batch_submit_times[batch_id] | ||
| else: | ||
| self._batch_submit_times.clear() | ||
| return | ||
|
|
||
| # Check the first batch, we only support batchsize=1 right now | ||
| first_batch = processes[0] | ||
| batch_id = id(first_batch) | ||
|
|
||
| if batch_id not in self._batch_submit_times: | ||
| self._batch_submit_times[batch_id] = time.time() | ||
| return | ||
|
|
||
| elapsed = time.time() - self._batch_submit_times[batch_id] | ||
|
|
||
| if elapsed > self._batch_timeout: | ||
| logger.error(f"Batch exceeded timeout of {self._batch_timeout}s (elapsed={elapsed:.2f}s).") | ||
|
|
||
| input_batch = first_batch[0] | ||
|
|
||
| pool.close() # Terminates all workers | ||
| pool.maybe_create_pool() # Recreate fresh pool | ||
|
|
||
| # Remove timed-out batch from queue | ||
| processes.popleft() | ||
|
|
||
| # Convert batch messages to InvalidMessages for DLQ | ||
| for idx, message in input_batch: | ||
| invalid_msg = InvalidMessage( | ||
| message.value.partition, | ||
| message.value.offset, | ||
| reason=f"Batch processing exceeded {self._batch_timeout}s timeout", | ||
| ) | ||
| invalid_messages.append(invalid_msg) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This part I wasn't entirely sure about, but I didn't think I could raise |
||
|
|
||
| del self._batch_submit_times[batch_id] | ||
|
|
||
| logger.info(f"Terminated worker and sent {len(input_batch)} messages to DLQ") | ||
|
|
||
|
|
||
| def process_kafka_message_with_service(msg: Message[KafkaPayload]) -> Any: | ||
| """Process a Kafka message using the actual service logic in a worker process.""" | ||
|
|
@@ -134,6 +208,7 @@ def create_kafka_consumer() -> LaunchpadKafkaConsumer: | |
|
|
||
| topics = [Topic(topic) for topic in config.topics] | ||
| topic = topics[0] if topics else Topic("default") | ||
|
|
||
| processor = StreamProcessor( | ||
| consumer=arroyo_consumer, | ||
| topic=topic, | ||
|
|
@@ -262,11 +337,10 @@ def create_with_partitions( | |
| strategy = LaunchpadRunTaskWithMultiprocessing( | ||
| process_kafka_message_with_service, | ||
| next_step=next_step, | ||
| max_batch_size=1, # Process immediately, subject to be re-tuned | ||
| max_batch_time=1, # Process after 1 second max, subject to be re-tuned | ||
| pool=self._pool, | ||
| input_block_size=None, | ||
| output_block_size=None, | ||
| batch_timeout=60.0 * 12, # 12 minutes | ||
| ) | ||
|
|
||
| return strategy | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is brittle, but we'll have alerting if something breaks around this. Timed-out messages are pretty rare, and we should have plenty of time to react in this case.