Migrate to nextgen-kernels-api message ID encoding and fix execution state sync

Zsailer · Zsailer · commit 8891e814f7fc · 2025-11-13T11:01:32.000-08:00
- Replace message_cache with extract_src_id/extract_channel utilities
- Remove obsolete test_kernel_message_cache.py
- Fix awareness sync to send execution states to reconnecting clients
- Add cell_msg_ids tracking for re-execution detection
diff --git a/jupyter_server_documents/kernel_client.py b/jupyter_server_documents/kernel_client.py
@@ -12,6 +12,7 @@
 import typing as t
 
 from nextgen_kernels_api.services.kernels.client import JupyterServerKernelClient
+from nextgen_kernels_api.services.kernels.message_utils import extract_src_id, extract_channel
 from traitlets import Instance, Set, Type, default
 
 from jupyter_server_documents.outputs import OutputProcessor
@@ -41,6 +42,9 @@ def _default_output_processor(self) -> OutputProcessor:
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+        # Track last message ID per cell to detect re-executions
+        self._cell_msg_ids: dict[str, str] = {}
+
         # Register listener for document-related messages
         # Combines state updates and outputs to share deserialization logic
         self.add_listener(
@@ -92,18 +96,19 @@ async def _handle_document_messages(self, channel_name: str, msg: list[bytes]):
             self.log.debug(f"Skipping message that can't be deserialized: {e}")
             return
 
-        # Extract parent message context for cell ID lookup
+        # Extract parent message context for cell ID and channel lookup
+        # Cell ID and channel are now encoded directly in the parent msg_id
         parent_msg_id = dmsg.get("parent_header", {}).get("msg_id")
-        parent_msg_data = self.message_cache.get(parent_msg_id) if parent_msg_id else None
-        cell_id = parent_msg_data.get("cell_id") if parent_msg_data else None
+        cell_id = extract_src_id(parent_msg_id) if parent_msg_id else None
+        parent_channel = extract_channel(parent_msg_id) if parent_msg_id else None
 
         # Dispatch to appropriate handler
         msg_type = dmsg.get("msg_type")
         match msg_type:
             case "kernel_info_reply":
                 await self._handle_kernel_info_reply(dmsg)
             case "status":
-                await self._handle_status_message(dmsg, parent_msg_data, cell_id)
+                await self._handle_status_message(dmsg, parent_channel, cell_id)
             case "execute_input":
                 await self._handle_execute_input(dmsg, cell_id)
             case "stream" | "display_data" | "execute_result" | "error" | "update_display_data" | "clear_output":
@@ -124,7 +129,7 @@ async def _handle_kernel_info_reply(self, msg: dict):
                     self.log.warning(f"Failed to update language info for yroom: {e}")
 
     async def _handle_status_message(
-        self, dmsg: dict, parent_msg_data: dict | None, cell_id: str | None
+        self, dmsg: dict, parent_channel: str | None, cell_id: str | None
     ):
         """Update kernel and cell execution states from status messages.
 
@@ -135,20 +140,14 @@ async def _handle_status_message(
         execution_state = content.get("execution_state")
 
         for yroom in self._yrooms:
-            awareness = yroom.get_awareness()
-            if awareness is None:
-                continue
-
             # Update document-level kernel status if this is a top-level status message
-            if parent_msg_data and parent_msg_data.get("channel") == "shell":
-                awareness.set_local_state_field(
-                    "kernel", {"execution_state": execution_state}
-                )
+            # (i.e., parent message came from shell channel)
+            if parent_channel == "shell":
+                yroom.set_kernel_execution_state(execution_state)
 
             # Update cell execution state for persistence and awareness
             if cell_id:
                 yroom.set_cell_execution_state(cell_id, execution_state)
-                yroom.set_cell_awareness_state(cell_id, execution_state)
                 break
 
     async def _handle_execute_input(self, dmsg: dict, cell_id: str | None):
@@ -205,10 +204,14 @@ def handle_incoming_message(self, channel_name: str, msg: list[bytes]):
 
             if cell_id:
                 # Clear outputs if this is a re-execution of the same cell
-                existing = self.message_cache.get(cell_id=cell_id)
-                if existing and existing["msg_id"] != msg_id:
+                # (different msg_id for the same cell_id)
+                last_msg_id = self._cell_msg_ids.get(cell_id)
+                if last_msg_id and last_msg_id != msg_id:
                     asyncio.create_task(self.output_processor.clear_cell_outputs(cell_id))
 
+                # Track this message ID for the cell
+                self._cell_msg_ids[cell_id] = msg_id
+
                 # Set awareness state immediately for queued cells
                 if msg_type == "execute_request" and channel_name == "shell":
                     for yroom in self._yrooms:
diff --git a/jupyter_server_documents/rooms/yroom.py b/jupyter_server_documents/rooms/yroom.py
@@ -405,38 +405,41 @@ def get_awareness(self, on_reset: Callable[[pycrdt.Awareness], Any] | None = Non
             self._on_reset_callbacks['awareness'].append(on_reset)
         return self._awareness
     
-    def get_cell_execution_states(self) -> dict:
-        """
-        Returns the persistent cell execution states for this room.
-        These states survive client disconnections but are not saved to disk.
-        """
-        if not hasattr(self, '_cell_execution_states'):
-            self._cell_execution_states: dict[str, str] = {}
-        return self._cell_execution_states
-    
     def set_cell_execution_state(self, cell_id: str, execution_state: str) -> None:
         """
-        Sets the execution state for a specific cell.
-        This state persists across client disconnections.
+        Sets the execution state for a specific cell in the awareness system.
+        This provides real-time updates to all connected clients and persists
+        while the server is running (survives client reconnections).
         """
-        if not hasattr(self, '_cell_execution_states'):
-            self._cell_execution_states = {}
-        self._cell_execution_states[cell_id] = execution_state
+        awareness = self.get_awareness()
+        if awareness is None:
+            return
+
+        local_state = awareness.get_local_state()
+        if local_state is not None:
+            cell_states = local_state.get("cell_execution_states", {})
+        else:
+            cell_states = {}
+
+        cell_states[cell_id] = execution_state
+        awareness.set_local_state_field("cell_execution_states", cell_states)
 
     def set_cell_awareness_state(self, cell_id: str, execution_state: str) -> None:
         """
-        Sets the execution state for a specific cell in the awareness system.
+        Alias for set_cell_execution_state for backward compatibility.
+        """
+        self.set_cell_execution_state(cell_id, execution_state)
+
+    def set_kernel_execution_state(self, execution_state: str) -> None:
+        """
+        Sets the kernel execution state in awareness.
         This provides real-time updates to all connected clients.
         """
         awareness = self.get_awareness()
         if awareness is not None:
-            local_state = awareness.get_local_state()
-            if local_state is not None:
-                cell_states = local_state.get("cell_execution_states", {})
-            else:
-                cell_states = {}
-            cell_states[cell_id] = execution_state
-            awareness.set_local_state_field("cell_execution_states", cell_states)
+            awareness.set_local_state_field(
+                "kernel", {"execution_state": execution_state}
+            )
 
     def add_message(self, client_id: str, message: bytes) -> None:
         """
@@ -540,6 +543,7 @@ def handle_sync_step1(self, client_id: str, message: bytes) -> None:
         - Computing a SyncStep2 reply,
         - Sending the reply to the client over WS, and
         - Sending a new SyncStep1 message immediately after.
+        - Finally, sending the current awareness state to the new client.
         """
         # Mark client as desynced
         new_client = self.clients.get(client_id)
@@ -586,6 +590,22 @@ def handle_sync_step1(self, client_id: str, message: bytes) -> None:
             )
             self.log.exception(e)
 
+        # Send current awareness state to the new client
+        try:
+            # Encode the state of every known awareness client and send it to this client only
+            all_client_ids = list(self._awareness._states.keys())
+            if all_client_ids:
+                awareness_update = self._awareness.encode_awareness_update(all_client_ids)
+                awareness_message = pycrdt.create_awareness_message(awareness_update)
+                assert isinstance(new_client.websocket, WebSocketHandler)
+                new_client.websocket.write_message(awareness_message, binary=True)
+        except Exception as e:
+            self.log.error(
+                f"An exception occurred when sending awareness to "
+                f"newly-synced client '{new_client.id}':"
+            )
+            self.log.exception(e)
+
 
     def handle_sync_step2(self, client_id: str, message: bytes) -> None:
         """
@@ -791,8 +811,8 @@ def _on_awareness_update(self, type: str, changes: tuple[dict[str, Any], Any]) -
         Arguments:
             type: The change type.
             changes: The awareness changes.
-        """        
-        
+        """
+
         self.log.debug(f"awareness update, type={type}, changes={changes}, changes[1]={changes[1]}, meta={self._awareness.meta}, ydoc.clientid={self._ydoc.client_id}, roomId={self.room_id}")
         updated_clients = [v for value in changes[0].values() for v in value]
         self.log.debug(f"awareness update, updated_clients={updated_clients}")
diff --git a/jupyter_server_documents/tests/test_kernel_message_cache.py b/jupyter_server_documents/tests/test_kernel_message_cache.py