-
Notifications
You must be signed in to change notification settings - Fork 235
[cuegui] Add gRPC keepalive and automatic channel recovery #2138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,6 +36,7 @@ | |
| import grpc | ||
|
|
||
| import opencue | ||
| from opencue.cuebot import Cuebot | ||
| from opencue_proto import job_pb2 | ||
|
|
||
| import cuegui.AbstractTreeWidget | ||
|
|
@@ -483,14 +484,20 @@ def _getUpdate(self): | |
| try: | ||
| if self.__job: | ||
| self.__lastUpdateTime = int(time.time()) | ||
| return self.__job.getFrames(**self.frameSearch.options) | ||
| result = self.__job.getFrames(**self.frameSearch.options) | ||
| # Record successful call for connection health tracking | ||
| Cuebot.recordSuccessfulCall() | ||
| return result | ||
| return [] | ||
| except grpc.RpcError as e: | ||
| # Handle gRPC errors - log but don't crash, allow UI to retry | ||
| # pylint: disable=no-member | ||
| if hasattr(e, 'code') and e.code() in [grpc.StatusCode.CANCELLED, | ||
| grpc.StatusCode.UNAVAILABLE]: | ||
| logger.warning("gRPC connection interrupted during frame update, will retry") | ||
| # Record failed call and potentially reset the channel | ||
| if Cuebot.recordFailedCall(): | ||
| logger.info("Channel reset due to connection issues, retrying") | ||
| else: | ||
| logger.error("gRPC error in _getUpdate: %s", e) | ||
| # pylint: enable=no-member | ||
|
|
@@ -515,13 +522,18 @@ def _getUpdateChanged(self): | |
| self.__lastUpdateTime = updated_data.server_time | ||
| self.__jobState = updated_data.state | ||
| updatedFrames = updated_data.updated_frames.updated_frames | ||
| # Record successful call for connection health tracking | ||
| Cuebot.recordSuccessfulCall() | ||
|
|
||
| except grpc.RpcError as e: | ||
| # Handle gRPC errors - allow UI to continue and retry | ||
| # pylint: disable=no-member | ||
| if hasattr(e, 'code'): | ||
| if e.code() in [grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE]: | ||
| logger.warning("gRPC connection interrupted during frame update, will retry") | ||
| # Record failed call and potentially reset the channel | ||
| if Cuebot.recordFailedCall(): | ||
| logger.info("Channel reset due to connection issues, retrying") | ||
|
Comment on lines
+534
to
+536
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move this check to RetryOnRpcErrorClientInterceptor as explained above. |
||
| # Return None to trigger a full update on next cycle | ||
| return None | ||
| if e.code() == grpc.StatusCode.NOT_FOUND: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,6 +73,14 @@ | |
| DEFAULT_MAX_MESSAGE_BYTES = 1024 ** 2 * 10 | ||
| DEFAULT_GRPC_PORT = 8443 | ||
|
|
||
| # gRPC keepalive settings to prevent "Connection reset by peer" errors | ||
| # These settings help maintain long-lived connections through load balancers and firewalls | ||
| DEFAULT_KEEPALIVE_TIME_MS = 30000 # Send keepalive ping every 30 seconds | ||
| DEFAULT_KEEPALIVE_TIMEOUT_MS = 10000 # Wait 10 seconds for keepalive response | ||
| DEFAULT_KEEPALIVE_PERMIT_WITHOUT_CALLS = True # Send keepalive even when no active RPCs | ||
| DEFAULT_MAX_CONNECTION_IDLE_MS = 0 # Disable max idle time (keep connection open) | ||
| DEFAULT_MAX_CONNECTION_AGE_MS = 0 # Disable max connection age | ||
|
Comment on lines
+78
to
+82
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not all these constants are being used when creating the channel. Please sanitize. |
||
|
|
||
| if platform.system() != 'Darwin': | ||
| # Avoid spamming users with epoll fork warning messages | ||
| os.environ["GRPC_POLL_STRATEGY"] = "epoll1" | ||
|
|
@@ -92,6 +100,12 @@ class Cuebot(object): | |
| Config = opencue.config.load_config_from_file() | ||
| Timeout = Config.get('cuebot.timeout', 10000) | ||
|
|
||
| # Connection health tracking | ||
| _lastSuccessfulCall = 0 | ||
| _consecutiveFailures = 0 | ||
| _maxConsecutiveFailures = 3 # Reset channel after this many failures | ||
| _channelResetInProgress = False | ||
|
|
||
| PROTO_MAP = { | ||
| 'action': filter_pb2, | ||
| 'allocation': facility_pb2, | ||
|
|
@@ -199,10 +213,32 @@ def setChannel(): | |
| # pylint: enable=logging-not-lazy | ||
| # TODO(bcipriano) Configure gRPC TLS. (Issue #150) | ||
| try: | ||
| # Configure keepalive settings to prevent "Connection reset by peer" errors | ||
| # These are essential for long-lived connections through load balancers | ||
| keepalive_time_ms = Cuebot.Config.get( | ||
| 'cuebot.keepalive_time_ms', DEFAULT_KEEPALIVE_TIME_MS) | ||
| keepalive_timeout_ms = Cuebot.Config.get( | ||
| 'cuebot.keepalive_timeout_ms', DEFAULT_KEEPALIVE_TIMEOUT_MS) | ||
| keepalive_permit_without_calls = Cuebot.Config.get( | ||
| 'cuebot.keepalive_permit_without_calls', DEFAULT_KEEPALIVE_PERMIT_WITHOUT_CALLS) | ||
|
Comment on lines
+222
to
+223
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This config has to be set on both server and client to have effect. As this is not configured on the server, it has no effect. |
||
|
|
||
| channel_options = [ | ||
| ('grpc.max_send_message_length', maxMessageBytes), | ||
| ('grpc.max_receive_message_length', maxMessageBytes), | ||
| # Keepalive settings to maintain connection health | ||
| ('grpc.keepalive_time_ms', keepalive_time_ms), | ||
| ('grpc.keepalive_timeout_ms', keepalive_timeout_ms), | ||
| ('grpc.keepalive_permit_without_calls', keepalive_permit_without_calls), | ||
| # Allow client to send keepalive pings even without data | ||
| ('grpc.http2.max_pings_without_data', 0), | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't recommend removing empty ping limitations. If you open a python shell and import pycue, for example, a connection will be created; if you don't send any data, the channel will live forever until the shell is closed. This has the potential to overwhelm the server with too many open, empty channels. The default value is 2, maybe we can increase it to 10. But given how noisy cuegui's communication with Cuebot is, I doubt there's a period of inactivity where pings are being sent without payload. |
||
| # Minimum time between pings (allows more frequent pings) | ||
| ('grpc.http2.min_time_between_pings_ms', 10000), | ||
| # Don't limit ping strikes (server may reject too many pings) | ||
| ('grpc.http2.min_ping_interval_without_data_ms', 5000), | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is mainly a server-side configuration, and it needs to be set on both the server and the client. Reducing the empty-ping interval might have the opposite effect of what you expect. Read grpc.config
|
||
| ] | ||
|
|
||
| Cuebot.RpcChannel = grpc.intercept_channel( | ||
| grpc.insecure_channel(connectStr, options=[ | ||
| ('grpc.max_send_message_length', maxMessageBytes), | ||
| ('grpc.max_receive_message_length', maxMessageBytes)]), | ||
| grpc.insecure_channel(connectStr, options=channel_options), | ||
| *interceptors) | ||
| # Test the connection | ||
| Cuebot.getStub('cue').GetSystemStats( | ||
|
|
@@ -302,6 +338,62 @@ def getConfig(): | |
| """Gets the Cuebot config object, originally read in from the config file on disk.""" | ||
| return Cuebot.Config | ||
|
|
||
| @staticmethod | ||
| def recordSuccessfulCall(): | ||
| """Record a successful gRPC call to track connection health.""" | ||
| Cuebot._lastSuccessfulCall = time.time() | ||
| Cuebot._consecutiveFailures = 0 | ||
|
|
||
| @staticmethod | ||
| def recordFailedCall(): | ||
| """Record a failed gRPC call and trigger channel reset if needed. | ||
|
|
||
| Returns True if the channel was reset and the caller should retry.""" | ||
| Cuebot._consecutiveFailures += 1 | ||
|
|
||
| if Cuebot._consecutiveFailures >= Cuebot._maxConsecutiveFailures: | ||
| if not Cuebot._channelResetInProgress: | ||
| Cuebot._channelResetInProgress = True | ||
| try: | ||
| logger.warning( | ||
| "Connection appears unhealthy after %d consecutive failures, " | ||
| "resetting gRPC channel...", Cuebot._consecutiveFailures) | ||
| Cuebot.resetChannel() | ||
| Cuebot._consecutiveFailures = 0 | ||
| return True | ||
| except Exception as e: | ||
| logger.error("Failed to reset gRPC channel: %s", e) | ||
| finally: | ||
| Cuebot._channelResetInProgress = False | ||
| return False | ||
|
|
||
| @staticmethod | ||
| def checkChannelHealth(): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't find where this method is being called outside of a unit test context. Am I missing something? |
||
| """Check if the gRPC channel is healthy by making a simple call. | ||
|
|
||
| Returns True if healthy, False otherwise.""" | ||
| if Cuebot.RpcChannel is None: | ||
| return False | ||
|
|
||
| try: | ||
| Cuebot.getStub('cue').GetSystemStats( | ||
| cue_pb2.CueGetSystemStatsRequest(), timeout=5000) | ||
| Cuebot.recordSuccessfulCall() | ||
| return True | ||
| except grpc.RpcError as e: | ||
| # pylint: disable=no-member | ||
| if hasattr(e, 'code') and e.code() == grpc.StatusCode.UNAVAILABLE: | ||
| details = e.details() if hasattr(e, 'details') else str(e) | ||
| logger.warning("Channel health check failed: %s", details) | ||
| Cuebot.recordFailedCall() | ||
| return False | ||
| # pylint: enable=no-member | ||
| # Other errors might be OK (e.g., permission issues) | ||
| return True | ||
| except Exception as e: | ||
| logger.warning("Channel health check failed with unexpected error: %s", e) | ||
| return False | ||
|
|
||
|
|
||
| # Python 2/3 compatible implementation of ABC | ||
| ABC = abc.ABCMeta('ABC', (object,), {'__slots__': ()}) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using a static method call to act as a channel health-check is error-prone and not recommended. What is the rationale behind having this on frameMonitorTree and not LayerMonitorTree, for example? It looks like a patch for a symptom rather than a fix for the actual illness.
If you want to implement logic to keep track of successful calls, please use the class RetryOnRpcErrorClientInterceptor, which intercepts every call to the grpc channel.