diff --git a/.gitattributes b/.gitattributes
index e465856f93..4525642f10 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -12,6 +12,8 @@
 # Prevent from counting in the language statistics
 engine/artifacts/** linguist-generated=true
 engine/sdks/** linguist-generated=true
+engine/sdks/typescript/runner/** linguist-generated=false
+engine/sdks/typescript/test-runner/** linguist-generated=false
 engine/sdks/schema/** linguist-generated=false
 
 website/public/llms.txt linguist-generated=true
diff --git a/Cargo.lock b/Cargo.lock
index c3fdd146ac..ac09eba5ab 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -742,7 +742,7 @@ dependencies = [
 
 [[package]]
 name = "clickhouse-inserter"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-channel",
@@ -766,7 +766,7 @@ dependencies = [
 
 [[package]]
 name = "clickhouse-user-query"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "clickhouse",
  "serde",
@@ -1369,7 +1369,7 @@ checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
 
 [[package]]
 name = "epoxy"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -1408,7 +1408,7 @@ dependencies = [
 
 [[package]]
 name = "epoxy-protocol"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "rivet-util",
@@ -1666,7 +1666,7 @@ dependencies = [
 
 [[package]]
 name = "gasoline"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -1713,7 +1713,7 @@ dependencies = [
 
 [[package]]
 name = "gasoline-macros"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2341,7 +2341,7 @@ dependencies = [
 
 [[package]]
 name = "internal"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "gasoline",
@@ -2739,7 +2739,7 @@ dependencies = [
 
 [[package]]
 name = "namespace"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "epoxy",
@@ -3211,7 +3211,7 @@ checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
 
 [[package]]
 name = "pegboard"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "base64 0.22.1",
@@ -3240,7 +3240,7 @@ dependencies = [
 
 [[package]]
 name = "pegboard-actor-kv"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "futures-util",
@@ -3259,7 +3259,7 @@ dependencies = [
 
 [[package]]
 name = "pegboard-gateway"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -3275,7 +3275,9 @@ dependencies = [
  "rivet-guard-core",
  "rivet-runner-protocol",
  "rivet-util",
+ "scc",
  "serde",
+ "serde_json",
  "thiserror 1.0.69",
  "tokio",
  "tokio-tungstenite",
@@ -3286,7 +3288,7 @@ dependencies = [
 
 [[package]]
 name = "pegboard-runner"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -3319,7 +3321,7 @@ dependencies = [
 
 [[package]]
 name = "pegboard-serverless"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "base64 0.22.1",
@@ -4018,7 +4020,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-api-builder"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -4062,7 +4064,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-api-peer"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -4091,7 +4093,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-api-public"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -4123,7 +4125,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-api-types"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "gasoline",
@@ -4138,7 +4140,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-api-util"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -4158,7 +4160,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-bootstrap"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "epoxy",
  "gasoline",
@@ -4174,7 +4176,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-cache"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "futures-util",
@@ -4215,14 +4217,14 @@ dependencies = [
 
 [[package]]
 name = "rivet-cache-result"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "rivet-util",
 ]
 
 [[package]]
 name = "rivet-config"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "config",
@@ -4240,7 +4242,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-data"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "gasoline",
@@ -4254,7 +4256,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-dump-openapi"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "rivet-api-public",
  "serde_json",
@@ -4263,7 +4265,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-engine"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -4323,7 +4325,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-env"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "lazy_static",
@@ -4333,7 +4335,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-error"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "indoc",
@@ -4345,7 +4347,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-error-macros"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "indoc",
  "proc-macro2",
@@ -4356,7 +4358,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-guard"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "axum 0.8.4",
@@ -4389,6 +4391,7 @@ dependencies = [
  "serde",
  "serde_json",
  "tokio",
+ "tokio-tungstenite",
  "tower 0.5.2",
  "tracing",
  "universaldb",
@@ -4399,7 +4402,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-guard-core"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -4444,7 +4447,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-logs"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "chrono",
@@ -4458,7 +4461,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-metrics"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "console-subscriber",
@@ -4476,7 +4479,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-pools"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-nats",
@@ -4509,7 +4512,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-runner-protocol"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "base64 0.22.1",
@@ -4524,7 +4527,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-runtime"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "console-subscriber",
@@ -4550,7 +4553,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-service-manager"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "chrono",
@@ -4565,7 +4568,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-telemetry"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "rivet-config",
@@ -4577,7 +4580,7 @@ dependencies = [
 [[package]]
 name = "rivet-term"
 version = "0.1.0"
-source = "git+https://github.com/rivet-gg/rivet-term?rev=55e328470b68c557fb9bc8298369f90182d35b6d#55e328470b68c557fb9bc8298369f90182d35b6d"
+source = "git+https://github.com/rivet-dev/rivet-term?rev=55e328470b68c557fb9bc8298369f90182d35b6d#55e328470b68c557fb9bc8298369f90182d35b6d"
 dependencies = [
  "console",
  "derive_builder 0.12.0",
@@ -4589,7 +4592,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-test-deps"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "futures-util",
@@ -4607,7 +4610,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-test-deps-docker"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "portpicker",
@@ -4638,7 +4641,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-types"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "gasoline",
@@ -4655,7 +4658,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-ups-protocol"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "base64 0.22.1",
@@ -4668,7 +4671,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-util"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -4697,7 +4700,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-util-id"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "serde",
  "thiserror 1.0.69",
@@ -4708,7 +4711,7 @@ dependencies = [
 
 [[package]]
 name = "rivet-workflow-worker"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "epoxy",
@@ -4929,6 +4932,12 @@ version = "1.0.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
 
+[[package]]
+name = "saa"
+version = "5.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f895faf11c46e98547f4de603a113ca76708d4b6832dbbe3c26528b7b81aca3b"
+
 [[package]]
 name = "safe_arch"
 version = "0.7.4"
@@ -4938,6 +4947,16 @@ dependencies = [
  "bytemuck",
 ]
 
+[[package]]
+name = "scc"
+version = "3.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd0b9e1890c5b17833a779c68a974f04170dfa36e3789395d17845418cc779ac"
+dependencies = [
+ "saa",
+ "sdd",
+]
+
 [[package]]
 name = "schannel"
 version = "0.1.27"
@@ -5019,6 +5038,12 @@ dependencies = [
  "untrusted",
 ]
 
+[[package]]
+name = "sdd"
+version = "4.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a8729f5224c38cb041e72fa9968dd4e379d3487b85359539d31d75ed95992d8"
+
 [[package]]
 name = "sealed"
 version = "0.4.0"
@@ -6325,7 +6350,7 @@ checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
 
 [[package]]
 name = "universaldb"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -6352,7 +6377,7 @@ dependencies = [
 
 [[package]]
 name = "universalpubsub"
-version = "25.8.2"
+version = "2.0.22-rc.1"
 dependencies = [
  "anyhow",
  "async-nats",
@@ -6481,9 +6506,8 @@ checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
 
 [[package]]
 name = "vbare"
-version = "0.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acf4d898b11572484cc064900e2a63dc88f72c621c2c52fd032b14537668702e"
+version = "0.0.3"
+source = "git+https://github.com/rivet-dev/vbare?rev=3ae474a0234801bb96d70bec4eddd4f2d640971e#3ae474a0234801bb96d70bec4eddd4f2d640971e"
 dependencies = [
  "anyhow",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 4b9d52c8ce..3c55f6a56d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -59,6 +59,7 @@ regex = "1.4"
 rstest = "0.26.1"
 rustls-pemfile = "2.2.0"
 rustyline = "15.0.0"
+scc = "3.3.2"
 serde_bare = "0.5.0"
 serde_html_form = "0.2.7"
 serde_yaml = "0.9.34"
@@ -147,7 +148,7 @@ version = "4.3"
 features = ["derive"]
 
 [workspace.dependencies.rivet-term]
-git = "https://github.com/rivet-gg/rivet-term"
+git = "https://github.com/rivet-dev/rivet-term"
 rev = "55e328470b68c557fb9bc8298369f90182d35b6d"
 
 [workspace.dependencies.clickhouse]
@@ -248,7 +249,8 @@ default-features = false
 features = ["ansi","fmt","json","env-filter"]
 
 [workspace.dependencies.vbare]
-version = "0.0.2"
+git = "https://github.com/rivet-dev/vbare"
+rev = "3ae474a0234801bb96d70bec4eddd4f2d640971e"
 
 [workspace.dependencies.vbare-compiler]
 version = "0.0.2"
diff --git a/engine/artifacts/errors/guard.websocket_pending_limit_reached.json b/engine/artifacts/errors/guard.websocket_pending_limit_reached.json
new file mode 100644
index 0000000000..770c084ec3
--- /dev/null
+++ b/engine/artifacts/errors/guard.websocket_pending_limit_reached.json
@@ -0,0 +1,5 @@
+{
+  "code": "websocket_pending_limit_reached",
+  "group": "guard",
+  "message": "Reached limit on pending websocket messages, aborting connection."
+}
\ No newline at end of file
diff --git a/engine/artifacts/errors/guard.websocket_service_retry.json b/engine/artifacts/errors/guard.websocket_service_retry.json
new file mode 100644
index 0000000000..e73bbbc507
--- /dev/null
+++ b/engine/artifacts/errors/guard.websocket_service_retry.json
@@ -0,0 +1,5 @@
+{
+  "code": "websocket_service_retry",
+  "group": "guard",
+  "message": "WebSocket service retry."
+}
\ No newline at end of file
diff --git a/engine/artifacts/errors/guard.websocket_service_timeout.json b/engine/artifacts/errors/guard.websocket_service_timeout.json
new file mode 100644
index 0000000000..41adc87d1c
--- /dev/null
+++ b/engine/artifacts/errors/guard.websocket_service_timeout.json
@@ -0,0 +1,5 @@
+{
+  "code": "websocket_service_timeout",
+  "group": "guard",
+  "message": "WebSocket service timed out."
+}
\ No newline at end of file
diff --git a/engine/artifacts/openapi.json b/engine/artifacts/openapi.json
index bb4dbf7b3b..517c5fa80b 100644
--- a/engine/artifacts/openapi.json
+++ b/engine/artifacts/openapi.json
@@ -11,7 +11,7 @@
       "name": "Apache-2.0",
       "identifier": "Apache-2.0"
     },
-    "version": "25.8.2"
+    "version": "2.0.22-rc.1"
   },
   "paths": {
     "/actors": {
diff --git a/engine/docker/dev-host/grafana/dashboards/traces.json b/engine/docker/dev-host/grafana/dashboards/traces.json
index 54c2d0aefb..9bb36ad163 100644
--- a/engine/docker/dev-host/grafana/dashboards/traces.json
+++ b/engine/docker/dev-host/grafana/dashboards/traces.json
@@ -173,7 +173,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
 					"refId": "A"
 				}
 			],
@@ -431,7 +431,7 @@
 					},
 					"pluginVersion": "4.9.0",
 					"queryType": "table",
-					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
+					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
 					"refId": "A"
 				}
 			],
@@ -581,7 +581,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "traces",
-					"rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
+					"rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
 					"refId": "A"
 				}
 			],
@@ -722,7 +722,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId, $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId IN (${trace_id:singlequote}),  $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
 					"refId": "A"
 				}
 			],
@@ -886,14 +886,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Span",
 				"multi": true,
 				"name": "span_name",
 				"options": [],
-				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -908,14 +908,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Ray ID",
 				"multi": true,
 				"name": "ray_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -930,29 +930,17 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Workflow ID",
 				"multi": true,
 				"name": "workflow_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
-			},
-			{
-				"current": {
-					"text": "30",
-					"value": "30"
-				},
-				"hide": 2,
-				"label": "Metric Export Interval (seconds)",
-				"name": "metric_interval",
-				"query": "30",
-				"skipUrlSync": true,
-				"type": "constant"
 			}
 		]
 	},
diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json
index 54c2d0aefb..9bb36ad163 100644
--- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json
+++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json
@@ -173,7 +173,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
 					"refId": "A"
 				}
 			],
@@ -431,7 +431,7 @@
 					},
 					"pluginVersion": "4.9.0",
 					"queryType": "table",
-					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
+					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
 					"refId": "A"
 				}
 			],
@@ -581,7 +581,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "traces",
-					"rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
+					"rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
 					"refId": "A"
 				}
 			],
@@ -722,7 +722,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId, $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId IN (${trace_id:singlequote}),  $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
 					"refId": "A"
 				}
 			],
@@ -886,14 +886,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Span",
 				"multi": true,
 				"name": "span_name",
 				"options": [],
-				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -908,14 +908,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Ray ID",
 				"multi": true,
 				"name": "ray_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -930,29 +930,17 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Workflow ID",
 				"multi": true,
 				"name": "workflow_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
-			},
-			{
-				"current": {
-					"text": "30",
-					"value": "30"
-				},
-				"hide": 2,
-				"label": "Metric Export Interval (seconds)",
-				"name": "metric_interval",
-				"query": "30",
-				"skipUrlSync": true,
-				"type": "constant"
 			}
 		]
 	},
diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/traces.json b/engine/docker/dev-multidc/core/grafana/dashboards/traces.json
index 54c2d0aefb..9bb36ad163 100644
--- a/engine/docker/dev-multidc/core/grafana/dashboards/traces.json
+++ b/engine/docker/dev-multidc/core/grafana/dashboards/traces.json
@@ -173,7 +173,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
 					"refId": "A"
 				}
 			],
@@ -431,7 +431,7 @@
 					},
 					"pluginVersion": "4.9.0",
 					"queryType": "table",
-					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
+					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
 					"refId": "A"
 				}
 			],
@@ -581,7 +581,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "traces",
-					"rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
+					"rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
 					"refId": "A"
 				}
 			],
@@ -722,7 +722,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId, $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId IN (${trace_id:singlequote}),  $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
 					"refId": "A"
 				}
 			],
@@ -886,14 +886,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Span",
 				"multi": true,
 				"name": "span_name",
 				"options": [],
-				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -908,14 +908,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Ray ID",
 				"multi": true,
 				"name": "ray_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -930,29 +930,17 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Workflow ID",
 				"multi": true,
 				"name": "workflow_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
-			},
-			{
-				"current": {
-					"text": "30",
-					"value": "30"
-				},
-				"hide": 2,
-				"label": "Metric Export Interval (seconds)",
-				"name": "metric_interval",
-				"query": "30",
-				"skipUrlSync": true,
-				"type": "constant"
 			}
 		]
 	},
diff --git a/engine/docker/dev-multinode/grafana/dashboards/traces.json b/engine/docker/dev-multinode/grafana/dashboards/traces.json
index 54c2d0aefb..9bb36ad163 100644
--- a/engine/docker/dev-multinode/grafana/dashboards/traces.json
+++ b/engine/docker/dev-multinode/grafana/dashboards/traces.json
@@ -173,7 +173,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
 					"refId": "A"
 				}
 			],
@@ -431,7 +431,7 @@
 					},
 					"pluginVersion": "4.9.0",
 					"queryType": "table",
-					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
+					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
 					"refId": "A"
 				}
 			],
@@ -581,7 +581,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "traces",
-					"rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
+					"rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
 					"refId": "A"
 				}
 			],
@@ -722,7 +722,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId, $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId IN (${trace_id:singlequote}),  $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
 					"refId": "A"
 				}
 			],
@@ -886,14 +886,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Span",
 				"multi": true,
 				"name": "span_name",
 				"options": [],
-				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -908,14 +908,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Ray ID",
 				"multi": true,
 				"name": "ray_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -930,29 +930,17 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Workflow ID",
 				"multi": true,
 				"name": "workflow_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
-			},
-			{
-				"current": {
-					"text": "30",
-					"value": "30"
-				},
-				"hide": 2,
-				"label": "Metric Export Interval (seconds)",
-				"name": "metric_interval",
-				"query": "30",
-				"skipUrlSync": true,
-				"type": "constant"
 			}
 		]
 	},
diff --git a/engine/docker/dev/docker-compose.yml b/engine/docker/dev/docker-compose.yml
index 313c6d0c4e..6733fda6b5 100644
--- a/engine/docker/dev/docker-compose.yml
+++ b/engine/docker/dev/docker-compose.yml
@@ -130,6 +130,8 @@ services:
     networks:
       - rivet-network
       - rivet-network-to-core
+    ports:
+      - '4317:4317'
   otel-collector-client:
     image: otel/opentelemetry-collector-contrib:latest
     restart: unless-stopped
diff --git a/engine/docker/dev/grafana/dashboards/traces.json b/engine/docker/dev/grafana/dashboards/traces.json
index 54c2d0aefb..9bb36ad163 100644
--- a/engine/docker/dev/grafana/dashboards/traces.json
+++ b/engine/docker/dev/grafana/dashboards/traces.json
@@ -173,7 +173,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
 					"refId": "A"
 				}
 			],
@@ -431,7 +431,7 @@
 					},
 					"pluginVersion": "4.9.0",
 					"queryType": "table",
-					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
+					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
 					"refId": "A"
 				}
 			],
@@ -581,7 +581,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "traces",
-					"rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
+					"rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
 					"refId": "A"
 				}
 			],
@@ -722,7 +722,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId, $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId IN (${trace_id:singlequote}),  $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
 					"refId": "A"
 				}
 			],
@@ -886,14 +886,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Span",
 				"multi": true,
 				"name": "span_name",
 				"options": [],
-				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -908,14 +908,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Ray ID",
 				"multi": true,
 				"name": "ray_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -930,29 +930,17 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Workflow ID",
 				"multi": true,
 				"name": "workflow_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
-			},
-			{
-				"current": {
-					"text": "30",
-					"value": "30"
-				},
-				"hide": 2,
-				"label": "Metric Export Interval (seconds)",
-				"name": "metric_interval",
-				"query": "30",
-				"skipUrlSync": true,
-				"type": "constant"
 			}
 		]
 	},
diff --git a/engine/docker/template/grafana-dashboards/traces.json b/engine/docker/template/grafana-dashboards/traces.json
index 54c2d0aefb..9bb36ad163 100644
--- a/engine/docker/template/grafana-dashboards/traces.json
+++ b/engine/docker/template/grafana-dashboards/traces.json
@@ -173,7 +173,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  ServiceName,\r\n  count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n  ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n",
 					"refId": "A"
 				}
 			],
@@ -431,7 +431,7 @@
 					},
 					"pluginVersion": "4.9.0",
 					"queryType": "table",
-					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
+					"rawSql": "SELECT\r\n  (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n  min(Timestamp) as Ts,\r\n  TraceId as `Trace ID`,\r\n  argMin(ServiceName, Timestamp) as `Service Name`,\r\n  argMin(SpanName, Timestamp) as `Span Name`,\r\n  argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n  divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n  ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n  AND ServiceName != 'loadgenerator'\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n",
 					"refId": "A"
 				}
 			],
@@ -581,7 +581,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "traces",
-					"rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
+					"rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000",
 					"refId": "A"
 				}
 			],
@@ -722,7 +722,7 @@
 					},
 					"pluginVersion": "4.0.6",
 					"queryType": "timeseries",
-					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId, $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND $__conditionalAll(ServiceName, $service_name)\r\n  AND $__conditionalAll(SpanName, $span_name)\r\n  AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
+					"rawSql": "SELECT\r\n  $__timeInterval(Timestamp) as time,\r\n  count(*) as ` `,\r\n  ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n  $__conditionalAll(TraceId IN (${trace_id:singlequote}),  $trace_id)\r\n  AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n  AND ServiceName IN (${service_name:singlequote})\r\n  AND SpanName IN (${span_name:singlequote})\r\n  AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n  AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n  AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000",
 					"refId": "A"
 				}
 			],
@@ -886,14 +886,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Span",
 				"multi": true,
 				"name": "span_name",
 				"options": [],
-				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -908,14 +908,14 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Ray ID",
 				"multi": true,
 				"name": "ray_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
@@ -930,29 +930,17 @@
 					"type": "grafana-clickhouse-datasource",
 					"uid": "clickhouse"
 				},
-				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"description": "",
 				"includeAll": true,
 				"label": "Workflow ID",
 				"multi": true,
 				"name": "workflow_id",
 				"options": [],
-				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;",
+				"query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;",
 				"refresh": 1,
 				"regex": "",
 				"type": "query"
-			},
-			{
-				"current": {
-					"text": "30",
-					"value": "30"
-				},
-				"hide": 2,
-				"label": "Metric Export Interval (seconds)",
-				"name": "metric_interval",
-				"query": "30",
-				"skipUrlSync": true,
-				"type": "constant"
 			}
 		]
 	},
diff --git a/engine/packages/api-public/src/actors/get_or_create.rs b/engine/packages/api-public/src/actors/get_or_create.rs
index 8f0403eef3..3fd6071442 100644
--- a/engine/packages/api-public/src/actors/get_or_create.rs
+++ b/engine/packages/api-public/src/actors/get_or_create.rs
@@ -1,8 +1,5 @@
 use anyhow::Result;
-use axum::{
-	http::HeaderMap,
-	response::{IntoResponse, Response},
-};
+use axum::response::{IntoResponse, Response};
 use rivet_api_builder::{
 	ApiError,
 	extract::{Extension, Json, Query},
@@ -77,11 +74,10 @@ pub struct GetOrCreateResponse {
 )]
 pub async fn get_or_create(
 	Extension(ctx): Extension<ApiCtx>,
-	headers: HeaderMap,
 	Query(query): Query<GetOrCreateQuery>,
 	Json(body): Json<GetOrCreateRequest>,
 ) -> Response {
-	match get_or_create_inner(ctx, headers, query, body).await {
+	match get_or_create_inner(ctx, query, body).await {
 		Ok(response) => Json(response).into_response(),
 		Err(err) => ApiError::from(err).into_response(),
 	}
@@ -90,7 +86,6 @@ pub async fn get_or_create(
 #[tracing::instrument(skip_all)]
 async fn get_or_create_inner(
 	ctx: ApiCtx,
-	headers: HeaderMap,
 	query: GetOrCreateQuery,
 	body: GetOrCreateRequest,
 ) -> Result<GetOrCreateResponse> {
diff --git a/engine/packages/api-public/src/health.rs b/engine/packages/api-public/src/health.rs
index d74a60827b..9528d716f8 100644
--- a/engine/packages/api-public/src/health.rs
+++ b/engine/packages/api-public/src/health.rs
@@ -87,7 +87,7 @@ async fn fanout_inner(ctx: ApiCtx) -> Result<FanoutResponse> {
 				}
 			} else {
 				// Remote datacenter - HTTP request
-				match send_health_checks(&ctx, &dc).await {
+				match send_health_checks(&dc).await {
 					Ok(response) => DatacenterHealth {
 						datacenter_label: dc.datacenter_label,
 						datacenter_name: dc.name.clone(),
@@ -129,7 +129,6 @@ async fn fanout_inner(ctx: ApiCtx) -> Result<FanoutResponse> {
 
 #[tracing::instrument(skip_all)]
 async fn send_health_checks(
-	ctx: &ApiCtx,
 	dc: &rivet_config::config::topology::Datacenter,
 ) -> Result<HealthResponse> {
 	let client = rivet_pools::reqwest::client().await?;
diff --git a/engine/packages/api-public/src/metadata.rs b/engine/packages/api-public/src/metadata.rs
index 694143b042..c4a7a2cb64 100644
--- a/engine/packages/api-public/src/metadata.rs
+++ b/engine/packages/api-public/src/metadata.rs
@@ -1,6 +1,6 @@
 use axum::Json;
 use axum::response::IntoResponse;
-use rivet_api_builder::{ApiError, extract::Extension};
+use rivet_api_builder::extract::Extension;
 use serde_json::json;
 
 use crate::ctx::ApiCtx;
diff --git a/engine/packages/api-public/src/runner_configs/delete.rs b/engine/packages/api-public/src/runner_configs/delete.rs
index caa966cfd0..551986683f 100644
--- a/engine/packages/api-public/src/runner_configs/delete.rs
+++ b/engine/packages/api-public/src/runner_configs/delete.rs
@@ -1,4 +1,4 @@
-use anyhow::{Context, Result};
+use anyhow::Result;
 use axum::response::{IntoResponse, Response};
 use futures_util::{StreamExt, TryStreamExt};
 use rivet_api_builder::{
diff --git a/engine/packages/api-public/src/runner_configs/utils.rs b/engine/packages/api-public/src/runner_configs/utils.rs
index cb2d0939c3..85c5afbb96 100644
--- a/engine/packages/api-public/src/runner_configs/utils.rs
+++ b/engine/packages/api-public/src/runner_configs/utils.rs
@@ -114,7 +114,7 @@ pub async fn fetch_serverless_runner_metadata(
 		});
 	}
 
-	let payload = serde_json::from_str::<ServerlessMetadataPayload>(&body_raw).map_err(|err| {
+	let payload = serde_json::from_str::<ServerlessMetadataPayload>(&body_raw).map_err(|_| {
 		ServerlessMetadataError::InvalidResponseJson {
 			body: body_for_user,
 		}
diff --git a/engine/packages/epoxy/src/http_client.rs b/engine/packages/epoxy/src/http_client.rs
index 779670c57c..1e325b0bbd 100644
--- a/engine/packages/epoxy/src/http_client.rs
+++ b/engine/packages/epoxy/src/http_client.rs
@@ -135,7 +135,7 @@ pub async fn send_message_to_address(
 	let client = rivet_pools::reqwest::client().await?;
 
 	// Create the request
-	let request = versioned::Request::latest(request);
+	let request = versioned::Request::wrap_latest(request);
 
 	// Send the request
 	let response_result = client
diff --git a/engine/packages/epoxy/src/http_routes.rs b/engine/packages/epoxy/src/http_routes.rs
index 79ba36d00d..e2b987f542 100644
--- a/engine/packages/epoxy/src/http_routes.rs
+++ b/engine/packages/epoxy/src/http_routes.rs
@@ -1,6 +1,6 @@
 use anyhow::*;
 use axum::body::Bytes;
-use epoxy_protocol::{protocol, versioned};
+use epoxy_protocol::versioned;
 use rivet_api_builder::prelude::*;
 use vbare::OwnedVersionedData;
 
@@ -30,5 +30,5 @@ pub async fn message(ctx: ApiCtx, path: VersionedPath, _query: (), body: Bytes)
 	// Process message directly using ops
 	let response = crate::replica::message_request::message_request(&ctx, request).await?;
 
-	versioned::Response::latest(response).serialize(path.version)
+	versioned::Response::wrap_latest(response).serialize(path.version)
 }
diff --git a/engine/packages/epoxy/src/keys/keys.rs b/engine/packages/epoxy/src/keys/keys.rs
index bbe22ba2d3..2e8ccf1729 100644
--- a/engine/packages/epoxy/src/keys/keys.rs
+++ b/engine/packages/epoxy/src/keys/keys.rs
@@ -1,5 +1,4 @@
 use anyhow::*;
-use epoxy_protocol::protocol::ReplicaId;
 use std::result::Result::Ok;
 use universaldb::prelude::*;
 
diff --git a/engine/packages/epoxy/src/keys/replica.rs b/engine/packages/epoxy/src/keys/replica.rs
index e65788d3a0..1bd6f95997 100644
--- a/engine/packages/epoxy/src/keys/replica.rs
+++ b/engine/packages/epoxy/src/keys/replica.rs
@@ -52,7 +52,7 @@ impl FormalKey for LogEntryKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		epoxy_protocol::versioned::LogEntry::latest(value)
+		epoxy_protocol::versioned::LogEntry::wrap_latest(value)
 			.serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION)
 	}
 }
@@ -162,7 +162,7 @@ impl FormalKey for ConfigKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		epoxy_protocol::versioned::ClusterConfig::latest(value)
+		epoxy_protocol::versioned::ClusterConfig::wrap_latest(value)
 			.serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION)
 	}
 }
@@ -223,7 +223,7 @@ impl FormalKey for CurrentBallotKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		epoxy_protocol::versioned::Ballot::latest(value)
+		epoxy_protocol::versioned::Ballot::wrap_latest(value)
 			.serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION)
 	}
 }
@@ -262,7 +262,7 @@ impl FormalKey for InstanceBallotKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		epoxy_protocol::versioned::Ballot::latest(value)
+		epoxy_protocol::versioned::Ballot::wrap_latest(value)
 			.serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION)
 	}
 }
diff --git a/engine/packages/epoxy/src/ops/explicit_prepare.rs b/engine/packages/epoxy/src/ops/explicit_prepare.rs
index e821303eff..2445f9d034 100644
--- a/engine/packages/epoxy/src/ops/explicit_prepare.rs
+++ b/engine/packages/epoxy/src/ops/explicit_prepare.rs
@@ -3,7 +3,7 @@ use epoxy_protocol::protocol::{self, ReplicaId};
 use gas::prelude::*;
 use rivet_api_builder::ApiCtx;
 
-use crate::{http_client, replica, types, utils};
+use crate::{http_client, replica, utils};
 
 #[derive(Debug)]
 pub struct Input {
@@ -79,15 +79,8 @@ pub async fn epoxy_explicit_prepare(
 	let result = match analyze_prepare_responses(&highest_ballot_responses, instance) {
 		PrepareDecision::Commit(payload) => {
 			// EPaxos Step 29: Run Commit phase
-			let result = crate::ops::propose::commit(
-				ctx,
-				&config,
-				replica_id,
-				&quorum_members,
-				payload,
-				false,
-			)
-			.await?;
+			let result =
+				crate::ops::propose::commit(ctx, &config, replica_id, payload, false).await?;
 			convert_proposal_result(result)
 		}
 		PrepareDecision::Accept(payload) => {
diff --git a/engine/packages/epoxy/src/ops/kv/get_local.rs b/engine/packages/epoxy/src/ops/kv/get_local.rs
index 64b3f95116..3df54bdee1 100644
--- a/engine/packages/epoxy/src/ops/kv/get_local.rs
+++ b/engine/packages/epoxy/src/ops/kv/get_local.rs
@@ -1,7 +1,6 @@
 use anyhow::*;
 use epoxy_protocol::protocol::ReplicaId;
 use gas::prelude::*;
-use rivet_api_builder::prelude::*;
 use universaldb::utils::{FormalKey, IsolationLevel::*};
 
 use crate::keys;
@@ -30,15 +29,12 @@ pub async fn epoxy_kv_get_local(ctx: &OperationCtx, input: &Input) -> Result<Out
 			let packed_key = packed_key.clone();
 			let kv_key = kv_key.clone();
 			async move {
-				(async move {
-					let value = tx.get(&packed_key, Serializable).await?;
-					if let Some(v) = value {
-						Ok(Some(kv_key.deserialize(&v)?))
-					} else {
-						Ok(None)
-					}
-				})
-				.await
+				let value = tx.get(&packed_key, Serializable).await?;
+				if let Some(v) = value {
+					Ok(Some(kv_key.deserialize(&v)?))
+				} else {
+					Ok(None)
+				}
 			}
 		})
 		.custom_instrument(tracing::info_span!("get_local_tx"))
diff --git a/engine/packages/epoxy/src/ops/kv/get_optimistic.rs b/engine/packages/epoxy/src/ops/kv/get_optimistic.rs
index a05a84dd89..b191c47a92 100644
--- a/engine/packages/epoxy/src/ops/kv/get_optimistic.rs
+++ b/engine/packages/epoxy/src/ops/kv/get_optimistic.rs
@@ -52,29 +52,26 @@ pub async fn epoxy_kv_get_optimistic(ctx: &OperationCtx, input: &Input) -> Resul
 			let kv_key = kv_key.clone();
 			let cache_key = cache_key.clone();
 			async move {
-				(async move {
-					let (value, cache_value) = tokio::try_join!(
-						async {
-							let v = tx.get(&packed_key, Serializable).await?;
-							if let Some(ref bytes) = v {
-								Ok(Some(kv_key.deserialize(bytes)?))
-							} else {
-								Ok(None)
-							}
-						},
-						async {
-							let v = tx.get(&packed_cache_key, Serializable).await?;
-							if let Some(ref bytes) = v {
-								Ok(Some(cache_key.deserialize(bytes)?))
-							} else {
-								Ok(None)
-							}
+				let (value, cache_value) = tokio::try_join!(
+					async {
+						let v = tx.get(&packed_key, Serializable).await?;
+						if let Some(ref bytes) = v {
+							Ok(Some(kv_key.deserialize(bytes)?))
+						} else {
+							Ok(None)
 						}
-					)?;
+					},
+					async {
+						let v = tx.get(&packed_cache_key, Serializable).await?;
+						if let Some(ref bytes) = v {
+							Ok(Some(cache_key.deserialize(bytes)?))
+						} else {
+							Ok(None)
+						}
+					}
+				)?;
 
-					Ok(value.or(cache_value))
-				})
-				.await
+				Ok(value.or(cache_value))
 			}
 		})
 		.custom_instrument(tracing::info_span!("get_optimistic_tx"))
@@ -134,13 +131,11 @@ pub async fn epoxy_kv_get_optimistic(ctx: &OperationCtx, input: &Input) -> Resul
 					let packed_cache_key = packed_cache_key.clone();
 					let cache_key = cache_key.clone();
 					let value_to_cache = value.clone();
+
 					async move {
-						(async move {
-							let serialized = cache_key.serialize(value_to_cache)?;
-							tx.set(&packed_cache_key, &serialized);
-							Ok(())
-						})
-						.await
+						let serialized = cache_key.serialize(value_to_cache)?;
+						tx.set(&packed_cache_key, &serialized);
+						Ok(())
 					}
 				})
 				.custom_instrument(tracing::info_span!("cache_value_tx"))
diff --git a/engine/packages/epoxy/src/ops/kv/mod.rs b/engine/packages/epoxy/src/ops/kv/mod.rs
index e30c032b35..aac85283e3 100644
--- a/engine/packages/epoxy/src/ops/kv/mod.rs
+++ b/engine/packages/epoxy/src/ops/kv/mod.rs
@@ -1,2 +1,3 @@
 pub mod get_local;
 pub mod get_optimistic;
+pub mod purge_local;
diff --git a/engine/packages/epoxy/src/ops/kv/purge_local.rs b/engine/packages/epoxy/src/ops/kv/purge_local.rs
new file mode 100644
index 0000000000..c1c2b6731e
--- /dev/null
+++ b/engine/packages/epoxy/src/ops/kv/purge_local.rs
@@ -0,0 +1,28 @@
+use anyhow::*;
+use epoxy_protocol::protocol::ReplicaId;
+use gas::prelude::*;
+
+use crate::keys;
+
+#[derive(Debug)]
+pub struct Input {
+	pub replica_id: ReplicaId,
+	pub keys: Vec<Vec<u8>>,
+}
+
+#[operation]
+pub async fn epoxy_kv_purge_local(ctx: &OperationCtx, input: &Input) -> Result<()> {
+	ctx.udb()?
+		.run(|tx| async move {
+			let tx = tx.with_subspace(keys::subspace(input.replica_id));
+
+			for key in &input.keys {
+				tx.delete(&keys::keys::KvOptimisticCacheKey::new(key.clone()));
+			}
+
+			Ok(())
+		})
+		.await?;
+
+	Ok(())
+}
diff --git a/engine/packages/epoxy/src/ops/propose.rs b/engine/packages/epoxy/src/ops/propose.rs
index eea9bcda6f..3435aa97d8 100644
--- a/engine/packages/epoxy/src/ops/propose.rs
+++ b/engine/packages/epoxy/src/ops/propose.rs
@@ -1,8 +1,9 @@
 use anyhow::*;
+use base64::Engine;
+use base64::engine::general_purpose::STANDARD as BASE64;
 use epoxy_protocol::protocol::{self, Path, Payload, ReplicaId};
 use gas::prelude::*;
 use rivet_api_builder::prelude::*;
-use rivet_config::Config;
 
 use crate::{http_client, replica, utils};
 
@@ -69,15 +70,7 @@ pub async fn epoxy_propose(ctx: &OperationCtx, input: &Input) -> Result<Proposal
 
 	match path {
 		Path::PathFast(protocol::PathFast { payload }) => {
-			commit(
-				ctx,
-				&config,
-				replica_id,
-				&quorum_members,
-				payload,
-				input.purge_cache,
-			)
-			.await
+			commit(ctx, &config, replica_id, payload, input.purge_cache).await
 		}
 		Path::PathSlow(protocol::PathSlow { payload }) => {
 			run_paxos_accept(
@@ -126,15 +119,7 @@ pub async fn run_paxos_accept(
 
 	// EPaxos Step 20
 	if quorum >= utils::calculate_quorum(quorum_members.len(), utils::QuorumType::Slow) {
-		commit(
-			ctx,
-			&config,
-			replica_id,
-			&quorum_members,
-			payload_for_accepts,
-			purge_cache,
-		)
-		.await
+		commit(ctx, &config, replica_id, payload_for_accepts, purge_cache).await
 	} else {
 		Ok(ProposalResult::ConsensusFailed)
 	}
@@ -145,7 +130,6 @@ pub async fn commit(
 	ctx: &OperationCtx,
 	config: &protocol::ClusterConfig,
 	replica_id: ReplicaId,
-	quorum_members: &[ReplicaId],
 	payload: Payload,
 	purge_cache: bool,
 ) -> Result<ProposalResult> {
@@ -184,6 +168,27 @@ pub async fn commit(
 		}
 	});
 
+	if purge_cache {
+		let keys = payload
+			.proposal
+			.commands
+			.iter()
+			.map(replica::utils::extract_key_from_command)
+			.flatten()
+			.map(|key| BASE64.encode(key))
+			.collect::<Vec<_>>();
+
+		// Purge optimistic cache for all dcs
+		if !keys.is_empty() {
+			let ctx = ctx.clone();
+			tokio::spawn(async move {
+				if let Err(err) = purge_optimistic_cache(ctx, keys).await {
+					tracing::error!(?err, "failed purging optimistic cache");
+				}
+			});
+		}
+	}
+
 	if let Some(cmd_err) = cmd_err {
 		Ok(ProposalResult::CommandError(cmd_err))
 	} else {
@@ -326,3 +331,22 @@ async fn send_commits(
 
 	Ok(())
 }
+
+async fn purge_optimistic_cache(ctx: OperationCtx, keys: Vec<String>) -> Result<()> {
+	for dc in &ctx.config().topology().datacenters {
+		let workflow_id = ctx
+			.workflow(crate::workflows::purger::Input {
+				replica_id: dc.datacenter_label as u64,
+			})
+			.tag("replica_id", dc.datacenter_label as u64)
+			.unique()
+			.dispatch()
+			.await?;
+		ctx.signal(crate::workflows::purger::Purge { keys: keys.clone() })
+			.to_workflow_id(workflow_id)
+			.send()
+			.await?;
+	}
+
+	Ok(())
+}
diff --git a/engine/packages/epoxy/src/ops/read_cluster_config.rs b/engine/packages/epoxy/src/ops/read_cluster_config.rs
index 62a23c2092..366d1062b6 100644
--- a/engine/packages/epoxy/src/ops/read_cluster_config.rs
+++ b/engine/packages/epoxy/src/ops/read_cluster_config.rs
@@ -1,5 +1,5 @@
 use anyhow::*;
-use epoxy_protocol::protocol::{self, ReplicaId};
+use epoxy_protocol::protocol::{self};
 use gas::prelude::*;
 
 use crate::utils;
diff --git a/engine/packages/epoxy/src/replica/lead_consensus.rs b/engine/packages/epoxy/src/replica/lead_consensus.rs
index 8103e78868..5af4edda10 100644
--- a/engine/packages/epoxy/src/replica/lead_consensus.rs
+++ b/engine/packages/epoxy/src/replica/lead_consensus.rs
@@ -4,7 +4,7 @@ use universaldb::Transaction;
 use universaldb::utils::{FormalKey, IsolationLevel::*};
 
 use crate::keys;
-use crate::replica::{ballot, messages, utils};
+use crate::replica::{ballot, utils};
 
 #[tracing::instrument(skip_all)]
 pub async fn lead_consensus(
diff --git a/engine/packages/epoxy/src/replica/message_request.rs b/engine/packages/epoxy/src/replica/message_request.rs
index 4e41d1abf1..7466c3331c 100644
--- a/engine/packages/epoxy/src/replica/message_request.rs
+++ b/engine/packages/epoxy/src/replica/message_request.rs
@@ -1,5 +1,5 @@
 use anyhow::*;
-use epoxy_protocol::protocol::{self, ReplicaId};
+use epoxy_protocol::protocol::{self};
 use gas::prelude::*;
 use rivet_api_builder::prelude::*;
 
@@ -150,6 +150,16 @@ pub async fn message_request(
 				value: result.value,
 			})
 		}
+		protocol::RequestKind::KvPurgeRequest(req) => {
+			// Handle KV purge request
+			ctx.op(ops::kv::purge_local::Input {
+				replica_id: current_replica_id,
+				keys: req.keys.clone(),
+			})
+			.await?;
+
+			protocol::ResponseKind::KvPurgeResponse
+		}
 	};
 
 	Ok(protocol::Response { kind })
diff --git a/engine/packages/epoxy/src/replica/messages/accept.rs b/engine/packages/epoxy/src/replica/messages/accept.rs
index f123796d0d..af1588dd1f 100644
--- a/engine/packages/epoxy/src/replica/messages/accept.rs
+++ b/engine/packages/epoxy/src/replica/messages/accept.rs
@@ -2,7 +2,7 @@ use anyhow::{Result, ensure};
 use epoxy_protocol::protocol;
 use universaldb::Transaction;
 
-use crate::replica::{ballot, messages};
+use crate::replica::ballot;
 
 #[tracing::instrument(skip_all)]
 pub async fn accept(
diff --git a/engine/packages/epoxy/src/replica/messages/accepted.rs b/engine/packages/epoxy/src/replica/messages/accepted.rs
index bcbb0a0147..b91d5e24e4 100644
--- a/engine/packages/epoxy/src/replica/messages/accepted.rs
+++ b/engine/packages/epoxy/src/replica/messages/accepted.rs
@@ -2,7 +2,7 @@ use anyhow::Result;
 use epoxy_protocol::protocol;
 use universaldb::Transaction;
 
-use crate::replica::{ballot, messages, utils};
+use crate::replica::ballot;
 
 // EPaxos Step 16
 #[tracing::instrument(skip_all)]
diff --git a/engine/packages/epoxy/src/replica/messages/pre_accept.rs b/engine/packages/epoxy/src/replica/messages/pre_accept.rs
index 452f9d51ba..a365a2e1be 100644
--- a/engine/packages/epoxy/src/replica/messages/pre_accept.rs
+++ b/engine/packages/epoxy/src/replica/messages/pre_accept.rs
@@ -3,7 +3,7 @@ use epoxy_protocol::protocol;
 use std::cmp;
 use universaldb::Transaction;
 
-use crate::replica::{ballot, messages, utils};
+use crate::replica::{ballot, utils};
 
 #[tracing::instrument(skip_all)]
 pub async fn pre_accept(
diff --git a/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs b/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs
index 4d510805b8..fbc5ccca51 100644
--- a/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs
+++ b/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs
@@ -1,5 +1,5 @@
 use anyhow::*;
-use epoxy_protocol::protocol::{self, ReplicaId};
+use epoxy_protocol::protocol::{self};
 use gas::prelude::*;
 use rivet_api_builder::ApiCtx;
 use serde::{Deserialize, Serialize};
diff --git a/engine/packages/epoxy/src/workflows/mod.rs b/engine/packages/epoxy/src/workflows/mod.rs
index ecc638cdb8..c4270f75e9 100644
--- a/engine/packages/epoxy/src/workflows/mod.rs
+++ b/engine/packages/epoxy/src/workflows/mod.rs
@@ -1,2 +1,3 @@
 pub mod coordinator;
+pub mod purger;
 pub mod replica;
diff --git a/engine/packages/epoxy/src/workflows/purger.rs b/engine/packages/epoxy/src/workflows/purger.rs
new file mode 100644
index 0000000000..f68339b349
--- /dev/null
+++ b/engine/packages/epoxy/src/workflows/purger.rs
@@ -0,0 +1,81 @@
+use anyhow::*;
+use base64::Engine;
+use base64::engine::general_purpose::STANDARD as BASE64;
+use epoxy_protocol::protocol;
+use futures_util::FutureExt;
+use gas::prelude::*;
+use rivet_api_builder::ApiCtx;
+use serde::{Deserialize, Serialize};
+
+use crate::http_client;
+
+#[derive(Debug, Deserialize, Serialize)]
+pub struct Input {
+	pub replica_id: protocol::ReplicaId,
+}
+
+// HACK: This workflow is a hack used to implement token revoking. It should be replaced with proper snapshot
+// reads
+#[workflow]
+pub async fn epoxy_purger(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> {
+	ctx.repeat(|ctx| {
+		let replica_id = input.replica_id;
+
+		async move {
+			let sig = ctx.listen::<Purge>().await?;
+
+			ctx.activity(PurgeInput {
+				replica_id,
+				keys: sig.keys,
+			})
+			.await?;
+
+			Ok(Loop::<()>::Continue)
+		}
+		.boxed()
+	})
+	.await?;
+
+	Ok(())
+}
+
+#[signal("epoxy_purger_purge")]
+pub struct Purge {
+	/// Base64 encoded keys.
+	pub keys: Vec<String>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Hash)]
+struct PurgeInput {
+	replica_id: protocol::ReplicaId,
+	/// Base64 encoded keys.
+	keys: Vec<String>,
+}
+
+#[activity(PurgeActivity)]
+#[max_retries = 18_446_744_073_709_551_615] // Retry forever
+async fn send_purge(ctx: &ActivityCtx, input: &PurgeInput) -> Result<()> {
+	let config = ctx
+		.op(crate::ops::read_cluster_config::Input {})
+		.await?
+		.config;
+
+	http_client::send_message(
+		&ApiCtx::new_from_activity(&ctx)?,
+		&config,
+		protocol::Request {
+			from_replica_id: ctx.config().epoxy_replica_id(),
+			to_replica_id: input.replica_id,
+			kind: protocol::RequestKind::KvPurgeRequest(protocol::KvPurgeRequest {
+				keys: input
+					.keys
+					.iter()
+					.map(|key| BASE64.decode(key).context("invalid base64 key"))
+					.collect::<Result<Vec<_>>>()?,
+			}),
+		},
+	)
+	.await?;
+
+	Ok(())
+}
diff --git a/engine/packages/epoxy/src/workflows/replica/mod.rs b/engine/packages/epoxy/src/workflows/replica/mod.rs
index 36a29cfbde..a284eab60b 100644
--- a/engine/packages/epoxy/src/workflows/replica/mod.rs
+++ b/engine/packages/epoxy/src/workflows/replica/mod.rs
@@ -1,5 +1,4 @@
 use anyhow::*;
-use epoxy_protocol::protocol;
 use futures_util::FutureExt;
 use gas::prelude::*;
 use serde::{Deserialize, Serialize};
diff --git a/engine/packages/epoxy/src/workflows/replica/setup.rs b/engine/packages/epoxy/src/workflows/replica/setup.rs
index 911be58f67..6022f5547c 100644
--- a/engine/packages/epoxy/src/workflows/replica/setup.rs
+++ b/engine/packages/epoxy/src/workflows/replica/setup.rs
@@ -675,7 +675,6 @@ async fn recover_key_value_with_instances(
 				committed_entries.push(CommittedEntry {
 					instance: (*instance_replica_id, *instance_slot_id),
 					entry: entry.clone(),
-					seq: entry.seq,
 					deps: entry.deps.clone(),
 				});
 			}
@@ -729,7 +728,7 @@ async fn recover_key_value_with_instances(
 struct CommittedEntry {
 	instance: (protocol::ReplicaId, protocol::SlotId),
 	entry: protocol::LogEntry,
-	seq: u64, // Seq is u64 in protocol
+	// seq: u64, // Seq is u64 in protocol
 	deps: Vec<protocol::Instance>,
 }
 
diff --git a/engine/packages/gasoline/src/builder/common/signal.rs b/engine/packages/gasoline/src/builder/common/signal.rs
index 7d601ab861..86f03fa878 100644
--- a/engine/packages/gasoline/src/builder/common/signal.rs
+++ b/engine/packages/gasoline/src/builder/common/signal.rs
@@ -43,6 +43,7 @@ impl<T: Signal + Serialize> SignalBuilder<T> {
 
 	// TODO: Get rid of this
 	// NOTE: This is a bad implementation because it disregards other errors that may have happened earlier
+	#[allow(non_snake_case)]
 	pub fn bypass_signal_from_workflow_I_KNOW_WHAT_IM_DOING(mut self) -> Self {
 		if let Some(BuilderError::CannotDispatchFromOpInWorkflow) = &self.error {
 			self.error = None;
diff --git a/engine/packages/gasoline/src/ctx/standalone.rs b/engine/packages/gasoline/src/ctx/standalone.rs
index 25e08c796e..222e764618 100644
--- a/engine/packages/gasoline/src/ctx/standalone.rs
+++ b/engine/packages/gasoline/src/ctx/standalone.rs
@@ -46,9 +46,9 @@ impl StandaloneCtx {
 	) -> WorkflowResult<Self> {
 		let ts = rivet_util::timestamp::now();
 
-		let span = tracing::Span::current();
-		span.record("req_id", req_id.to_string());
-		span.record("ray_id", ray_id.to_string());
+		tracing::Span::current()
+			.record("req_id", req_id.to_string())
+			.record("ray_id", ray_id.to_string());
 
 		let msg_ctx = MessageCtx::new(&config, &pools, &cache, ray_id)?;
 
diff --git a/engine/packages/gasoline/src/db/kv/mod.rs b/engine/packages/gasoline/src/db/kv/mod.rs
index 9bc6c0c095..debb197da8 100644
--- a/engine/packages/gasoline/src/db/kv/mod.rs
+++ b/engine/packages/gasoline/src/db/kv/mod.rs
@@ -2532,7 +2532,7 @@ impl Database for DatabaseKv {
 
 				Ok(())
 			})
-			.custom_instrument(tracing::info_span!("commit_workflow_sleep_event_tx"))
+			.custom_instrument(tracing::info_span!("upsert_loop_event_tx"))
 			.await
 			.map_err(WorkflowError::Udb)?;
 
@@ -2593,7 +2593,7 @@ impl Database for DatabaseKv {
 
 				Ok(())
 			})
-			.custom_instrument(tracing::info_span!("update_workflow_sleep_event_tx"))
+			.custom_instrument(tracing::info_span!("update_workflow_sleep_state_tx"))
 			.await
 			.map_err(WorkflowError::Udb)?;
 
diff --git a/engine/packages/guard-core/src/custom_serve.rs b/engine/packages/guard-core/src/custom_serve.rs
index 3d54fdaeaa..351747e96d 100644
--- a/engine/packages/guard-core/src/custom_serve.rs
+++ b/engine/packages/guard-core/src/custom_serve.rs
@@ -3,6 +3,8 @@ use async_trait::async_trait;
 use bytes::Bytes;
 use http_body_util::Full;
 use hyper::{Request, Response};
+use tokio_tungstenite::tungstenite::protocol::frame::CloseFrame;
+use uuid::Uuid;
 
 use crate::WebSocketHandle;
 use crate::proxy_service::ResponseBody;
@@ -25,5 +27,7 @@ pub trait CustomServeTrait: Send + Sync {
 		headers: &hyper::HeaderMap,
 		path: &str,
 		request_context: &mut RequestContext,
-	) -> Result<()>;
+		// Identifies the websocket across retries.
+		unique_request_id: Uuid,
+	) -> Result<Option<CloseFrame>>;
 }
diff --git a/engine/packages/guard-core/src/errors.rs b/engine/packages/guard-core/src/errors.rs
index f45b47c0ce..c40d17a8dc 100644
--- a/engine/packages/guard-core/src/errors.rs
+++ b/engine/packages/guard-core/src/errors.rs
@@ -81,6 +81,14 @@ pub struct ServiceUnavailable;
 )]
 pub struct WebSocketServiceUnavailable;
 
+#[derive(RivetError, Serialize, Deserialize)]
+#[error("guard", "websocket_service_retry", "WebSocket service retry.")]
+pub struct WebSocketServiceRetry;
+
+#[derive(RivetError, Serialize, Deserialize)]
+#[error("guard", "websocket_service_timeout", "WebSocket service timed out.")]
+pub struct WebSocketServiceTimeout;
+
 #[derive(RivetError, Serialize, Deserialize)]
 #[error(
 	"guard",
diff --git a/engine/packages/guard-core/src/proxy_service.rs b/engine/packages/guard-core/src/proxy_service.rs
index f4498099f3..cacc8be35a 100644
--- a/engine/packages/guard-core/src/proxy_service.rs
+++ b/engine/packages/guard-core/src/proxy_service.rs
@@ -28,14 +28,19 @@ use tokio_tungstenite::tungstenite::{
 };
 use tracing::Instrument;
 use url::Url;
+use uuid::Uuid;
 
 use crate::{
 	WebSocketHandle, custom_serve::CustomServeTrait, errors, metrics,
 	request_context::RequestContext,
 };
 
+const X_RIVET_TARGET: HeaderName = HeaderName::from_static("x-rivet-target");
+const X_RIVET_ACTOR: HeaderName = HeaderName::from_static("x-rivet-actor");
+const X_RIVET_TOKEN: HeaderName = HeaderName::from_static("x-rivet-token");
 pub const X_FORWARDED_FOR: HeaderName = HeaderName::from_static("x-forwarded-for");
 pub const X_RIVET_ERROR: HeaderName = HeaderName::from_static("x-rivet-error");
+
 const ROUTE_CACHE_TTL: Duration = Duration::from_secs(60 * 10); // 10 minutes
 const PROXY_STATE_CACHE_TTL: Duration = Duration::from_secs(60 * 60); // 1 hour
 const WEBSOCKET_CLOSE_LINGER: Duration = Duration::from_millis(100); // Keep TCP connection open briefly after WebSocket close
@@ -879,7 +884,7 @@ impl ProxyService {
 					match res {
 						Ok(resp) => {
 							// Check if this is a retryable response
-							if should_retry(resp.status(), resp.headers()) {
+							if should_retry_request_inner(resp.status(), resp.headers()) {
 								// Request connect error, might retry
 								tracing::debug!(
 									"Request attempt {attempts} failed (service unavailable)"
@@ -1017,10 +1022,10 @@ impl ProxyService {
 				while attempts < max_attempts {
 					attempts += 1;
 
-					let resp = handler
+					let res = handler
 						.handle_request(req_collected.clone(), request_context)
-						.await?;
-					if should_retry(resp.status(), resp.headers()) {
+						.await;
+					if should_retry_request(&res) {
 						// Request connect error, might retry
 						tracing::debug!("Request attempt {attempts} failed (service unavailable)");
 
@@ -1047,7 +1052,7 @@ impl ProxyService {
 						continue;
 					}
 
-					return Ok(resp);
+					return res;
 				}
 
 				// If we get here, all attempts failed
@@ -1059,6 +1064,7 @@ impl ProxyService {
 		}
 	}
 
+	/// Modifies the incoming request before it is proxied.
 	fn proxied_request_builder(
 		&self,
 		req_parts: &hyper::http::request::Parts,
@@ -1088,13 +1094,16 @@ impl ProxyService {
 			.method(req_parts.method.clone())
 			.uri(url.to_string());
 
-		// Add proxy headers
-		{
-			let headers = builder
-				.headers_mut()
-				.expect("request builder unexpectedly in error state");
-			add_proxy_headers_with_addr(headers, &req_parts.headers, self.remote_addr)?;
-		}
+		// Modify proxy headers
+		let headers = builder
+			.headers_mut()
+			.expect("request builder unexpectedly in error state");
+
+		headers.remove(X_RIVET_TARGET);
+		headers.remove(X_RIVET_ACTOR);
+		headers.remove(X_RIVET_TOKEN);
+
+		add_proxy_headers_with_addr(headers, &req_parts.headers, self.remote_addr)?;
 
 		Ok(builder)
 	}
@@ -1171,7 +1180,7 @@ impl ProxyService {
 		}
 
 		// Handle WebSocket upgrade properly with hyper_tungstenite
-		tracing::debug!("Upgrading client connection to WebSocket");
+		tracing::debug!(%req_path, "Upgrading client connection to WebSocket");
 		let (client_response, client_ws) = match hyper_tungstenite::upgrade(req, None) {
 			Ok(x) => {
 				tracing::debug!("Client WebSocket upgrade successful");
@@ -1782,18 +1791,20 @@ impl ProxyService {
 			}
 			ResolveRouteOutput::Response(_) => unreachable!(),
 			ResolveRouteOutput::CustomServe(mut handlers) => {
-				tracing::debug!("Spawning task to handle WebSocket communication");
+				tracing::debug!(%req_path, "Spawning task to handle WebSocket communication");
 				let mut request_context = request_context.clone();
 				let req_headers = req_headers.clone();
 				let req_path = req_path.clone();
 				let req_host = req_host.clone();
 
-				// TODO: Handle errors here, the error message is lost
 				tokio::spawn(
 					async move {
+						let request_id = Uuid::new_v4();
 						let mut attempts = 0u32;
 
-						let ws_handle = WebSocketHandle::new(client_ws);
+						let ws_handle = WebSocketHandle::new(client_ws)
+							.await
+							.context("failed initiating websocket handle")?;
 
 						loop {
 							match handlers
@@ -1802,19 +1813,15 @@ impl ProxyService {
 									&req_headers,
 									&req_path,
 									&mut request_context,
+									request_id,
 								)
 								.await
 							{
-								Ok(()) => {
+								Ok(close_frame) => {
 									tracing::debug!("websocket handler complete, closing");
 
 									// Send graceful close
-									ws_handle
-										.send(to_hyper_close(Some(CloseFrame {
-											code: CloseCode::Normal,
-											reason: "".into(),
-										})))
-										.await?;
+									ws_handle.send(to_hyper_close(close_frame)).await?;
 
 									// Flush to ensure close frame is sent
 									ws_handle.flush().await?;
@@ -1825,13 +1832,32 @@ impl ProxyService {
 									break;
 								}
 								Err(err) => {
-									attempts += 1;
-									if attempts > max_attempts || !is_retryable_ws_error(&err) {
+									tracing::debug!(?err, "websocket handler error");
+
+									// Denotes that the connection did not fail, but needs to be retried to
+									// resolve a new target
+									let ws_retry = is_ws_retry(&err);
+
+									if ws_retry {
+										attempts = 0;
+									} else {
+										attempts += 1;
+									}
+
+									if attempts > max_attempts
+										|| (!is_retryable_ws_error(&err) && !ws_retry)
+									{
+										tracing::debug!(
+											?attempts,
+											?max_attempts,
+											"WebSocket failed"
+										);
+
 										// Close WebSocket with error
 										ws_handle
-											.accept_and_send(to_hyper_close(Some(
-												err_to_close_frame(err, ray_id),
-											)))
+											.send(to_hyper_close(Some(err_to_close_frame(
+												err, ray_id,
+											))))
 											.await?;
 
 										// Flush to ensure close frame is sent
@@ -1842,11 +1868,19 @@ impl ProxyService {
 
 										break;
 									} else {
-										let backoff = ProxyService::calculate_backoff(
-											attempts,
-											initial_interval,
-										);
-										tokio::time::sleep(backoff).await;
+										if !ws_retry {
+											let backoff = ProxyService::calculate_backoff(
+												attempts,
+												initial_interval,
+											);
+
+											tracing::debug!(
+												?backoff,
+												"WebSocket attempt {attempts} failed (service unavailable)"
+											);
+
+											tokio::time::sleep(backoff).await;
+										}
 
 										match state
 											.resolve_route(
@@ -1864,11 +1898,9 @@ impl ProxyService {
 											}
 											Ok(ResolveRouteOutput::Response(response)) => {
 												ws_handle
-													.accept_and_send(to_hyper_close(Some(
-														str_to_close_frame(
-															response.message.as_ref(),
-														),
-													)))
+													.send(to_hyper_close(Some(str_to_close_frame(
+														response.message.as_ref(),
+													))))
 													.await?;
 
 												// Flush to ensure close frame is sent
@@ -1879,12 +1911,10 @@ impl ProxyService {
 											}
 											Ok(ResolveRouteOutput::Target(_)) => {
 												ws_handle
-													.accept_and_send(to_hyper_close(Some(
-														err_to_close_frame(
-															errors::WebSocketTargetChanged.build(),
-															ray_id,
-														),
-													)))
+													.send(to_hyper_close(Some(err_to_close_frame(
+														errors::WebSocketTargetChanged.build(),
+														ray_id,
+													))))
 													.await?;
 
 												// Flush to ensure close frame is sent
@@ -1897,9 +1927,9 @@ impl ProxyService {
 											}
 											Err(err) => {
 												ws_handle
-													.accept_and_send(to_hyper_close(Some(
-														err_to_close_frame(err, ray_id),
-													)))
+													.send(to_hyper_close(Some(err_to_close_frame(
+														err, ray_id,
+													))))
 													.await?;
 
 												// Flush to ensure close frame is sent
@@ -1947,13 +1977,17 @@ impl ProxyService {
 
 impl ProxyService {
 	// Process an individual request
-	#[tracing::instrument(name = "guard_request", skip_all)]
+	#[tracing::instrument(name = "guard_request", skip_all, fields(ray_id, req_id))]
 	pub async fn process(&self, mut req: Request<BodyIncoming>) -> Result<Response<ResponseBody>> {
 		let start_time = Instant::now();
 
 		let request_ids = RequestIds::new(self.state.config.dc_label());
 		req.extensions_mut().insert(request_ids);
 
+		tracing::Span::current()
+			.record("req_id", request_ids.req_id.to_string())
+			.record("ray_id", request_ids.ray_id.to_string());
+
 		// Create request context for analytics tracking
 		let mut request_context =
 			RequestContext::new(self.state.clickhouse_inserter.clone(), request_ids);
@@ -2063,35 +2097,50 @@ impl ProxyService {
 
 				// If we receive an error during a websocket request, we attempt to open the websocket anyway
 				// so we can send the error via websocket instead of http. Most websocket clients don't handle
-				// HTTP errors in a meaningful way for the user resulting in unhelpful errors
+				// HTTP errors in a meaningful way, resulting in unhelpful errors for the user
 				if is_websocket {
 					tracing::debug!("Upgrading client connection to WebSocket for error proxy");
 					match hyper_tungstenite::upgrade(mock_req, None) {
 						Ok((client_response, client_ws)) => {
 							tracing::debug!("Client WebSocket upgrade for error proxy successful");
 
-							tokio::spawn(async move {
-								let ws_handle = WebSocketHandle::new(client_ws);
-								let frame = err_to_close_frame(err, Some(request_ids.ray_id));
+							tokio::spawn(
+								async move {
+									let ws_handle = match WebSocketHandle::new(client_ws).await {
+										Ok(ws_handle) => ws_handle,
+										Err(err) => {
+											tracing::debug!(
+												?err,
+												"failed initiating websocket handle for error proxy"
+											);
+											return;
+										}
+									};
+									let frame = err_to_close_frame(err, Some(request_ids.ray_id));
 
-								// Manual conversion to handle different tungstenite versions
-								let code_num: u16 = frame.code.into();
-								let reason = frame.reason.clone();
+									// Manual conversion to handle different tungstenite versions
+									let code_num: u16 = frame.code.into();
+									let reason = frame.reason.clone();
 
-								if let Err(err) = ws_handle
-									.accept_and_send(
-										tokio_tungstenite::tungstenite::Message::Close(Some(
+									if let Err(err) = ws_handle
+										.send(tokio_tungstenite::tungstenite::Message::Close(Some(
 											tokio_tungstenite::tungstenite::protocol::CloseFrame {
 												code: code_num.into(),
 												reason,
 											},
-										)),
-									)
-									.await
-								{
-									tracing::debug!(?err, "failed sending error proxy");
+										)))
+										.await
+									{
+										tracing::debug!(
+											?err,
+											"failed sending websocket error proxy"
+										);
+									}
 								}
-							});
+								.instrument(
+									tracing::info_span!("ws_error_proxy_task", ?request_ids.ray_id),
+								),
+							);
 
 							// Return the response that will upgrade the client connection
 							// For proper WebSocket handshaking, we need to preserve the original response
@@ -2371,8 +2420,21 @@ fn err_into_response(err: anyhow::Error) -> Result<Response<ResponseBody>> {
 		.map_err(Into::into)
 }
 
+fn should_retry_request(res: &Result<Response<ResponseBody>>) -> bool {
+	match res {
+		Ok(resp) => should_retry_request_inner(resp.status(), resp.headers()),
+		Err(err) => {
+			if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::<RivetError>()) {
+				rivet_err.group() == "guard" && rivet_err.code() == "service_unavailable"
+			} else {
+				false
+			}
+		}
+	}
+}
+
 // Determine if a response should trigger a retry: 503 + x-rivet-error
-fn should_retry(status: StatusCode, headers: &hyper::HeaderMap) -> bool {
+fn should_retry_request_inner(status: StatusCode, headers: &hyper::HeaderMap) -> bool {
 	status == StatusCode::SERVICE_UNAVAILABLE && headers.contains_key(X_RIVET_ERROR)
 }
 
@@ -2385,6 +2447,14 @@ fn is_retryable_ws_error(err: &anyhow::Error) -> bool {
 	}
 }
 
+fn is_ws_retry(err: &anyhow::Error) -> bool {
+	if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::<RivetError>()) {
+		rivet_err.group() == "guard" && rivet_err.code() == "websocket_service_retry"
+	} else {
+		false
+	}
+}
+
 fn str_to_close_frame(err: &str) -> CloseFrame {
 	// NOTE: reason cannot be more than 123 bytes as per the WS protocol spec
 	let reason = rivet_util::safe_slice(err, 0, 123).into();
diff --git a/engine/packages/guard-core/src/websocket_handle.rs b/engine/packages/guard-core/src/websocket_handle.rs
index bb17d2df3b..763f337b20 100644
--- a/engine/packages/guard-core/src/websocket_handle.rs
+++ b/engine/packages/guard-core/src/websocket_handle.rs
@@ -4,7 +4,6 @@ use hyper::upgrade::Upgraded;
 use hyper_tungstenite::HyperWebsocket;
 use hyper_tungstenite::tungstenite::Message as WsMessage;
 use hyper_util::rt::TokioIo;
-use std::ops::Deref;
 use std::sync::Arc;
 use tokio::sync::Mutex;
 use tokio_tungstenite::WebSocketStream;
@@ -14,104 +13,34 @@ pub type WebSocketReceiver = futures_util::stream::SplitStream<WebSocketStream<T
 pub type WebSocketSender =
 	futures_util::stream::SplitSink<WebSocketStream<TokioIo<Upgraded>>, WsMessage>;
 
-enum WebSocketState {
-	Unaccepted { websocket: HyperWebsocket },
-	Accepting,
-	Split { ws_tx: WebSocketSender },
-}
-
 #[derive(Clone)]
-pub struct WebSocketHandle(Arc<WebSocketHandleInner>);
-
-impl WebSocketHandle {
-	pub fn new(websocket: HyperWebsocket) -> Self {
-		Self(Arc::new(WebSocketHandleInner {
-			state: Mutex::new(WebSocketState::Unaccepted { websocket }),
-		}))
-	}
+pub struct WebSocketHandle {
+	ws_tx: Arc<Mutex<WebSocketSender>>,
+	ws_rx: Arc<Mutex<WebSocketReceiver>>,
 }
 
-impl Deref for WebSocketHandle {
-	type Target = WebSocketHandleInner;
-
-	fn deref(&self) -> &Self::Target {
-		&*self.0
-	}
-}
-
-pub struct WebSocketHandleInner {
-	state: Mutex<WebSocketState>,
-}
+impl WebSocketHandle {
+	pub async fn new(websocket: HyperWebsocket) -> Result<Self> {
+		let ws_stream = websocket.await?;
+		let (ws_tx, ws_rx) = ws_stream.split();
 
-impl WebSocketHandleInner {
-	pub async fn accept(&self) -> Result<WebSocketReceiver> {
-		let mut state = self.state.lock().await;
-		Self::accept_inner(&mut *state).await
+		Ok(Self {
+			ws_tx: Arc::new(Mutex::new(ws_tx)),
+			ws_rx: Arc::new(Mutex::new(ws_rx)),
+		})
 	}
 
 	pub async fn send(&self, message: WsMessage) -> Result<()> {
-		let mut state = self.state.lock().await;
-		match &mut *state {
-			WebSocketState::Unaccepted { .. } | WebSocketState::Accepting => {
-				bail!("websocket has not been accepted");
-			}
-			WebSocketState::Split { ws_tx } => {
-				ws_tx.send(message).await?;
-				Ok(())
-			}
-		}
-	}
-
-	pub async fn accept_and_send(&self, message: WsMessage) -> Result<()> {
-		let mut state = self.state.lock().await;
-		match &mut *state {
-			WebSocketState::Unaccepted { .. } => {
-				let _ = Self::accept_inner(&mut *state).await?;
-				let WebSocketState::Split { ws_tx } = &mut *state else {
-					bail!("websocket should be accepted");
-				};
-				ws_tx.send(message).await?;
-				Ok(())
-			}
-			WebSocketState::Accepting => {
-				bail!("in accepting state")
-			}
-			WebSocketState::Split { ws_tx } => {
-				ws_tx.send(message).await?;
-				Ok(())
-			}
-		}
+		self.ws_tx.lock().await.send(message).await?;
+		Ok(())
 	}
 
 	pub async fn flush(&self) -> Result<()> {
-		let mut state = self.state.lock().await;
-		match &mut *state {
-			WebSocketState::Unaccepted { .. } | WebSocketState::Accepting => {
-				bail!("websocket has not been accepted");
-			}
-			WebSocketState::Split { ws_tx } => {
-				ws_tx.flush().await?;
-				Ok(())
-			}
-		}
+		self.ws_tx.lock().await.flush().await?;
+		Ok(())
 	}
 
-	async fn accept_inner(state: &mut WebSocketState) -> Result<WebSocketReceiver> {
-		if !matches!(*state, WebSocketState::Unaccepted { .. }) {
-			bail!("websocket already accepted")
-		}
-
-		// Accept websocket
-		let old_state = std::mem::replace(&mut *state, WebSocketState::Accepting);
-		let WebSocketState::Unaccepted { websocket } = old_state else {
-			bail!("should be in unaccepted state");
-		};
-
-		// Accept WS
-		let ws_stream = websocket.await?;
-		let (ws_tx, ws_rx) = ws_stream.split();
-		*state = WebSocketState::Split { ws_tx };
-
-		Ok(ws_rx)
+	pub fn recv(&self) -> Arc<Mutex<WebSocketReceiver>> {
+		self.ws_rx.clone()
 	}
 }
diff --git a/engine/packages/guard/Cargo.toml b/engine/packages/guard/Cargo.toml
index e5832eb483..7da5a574f6 100644
--- a/engine/packages/guard/Cargo.toml
+++ b/engine/packages/guard/Cargo.toml
@@ -43,6 +43,7 @@ rustls.workspace = true
 serde_json.workspace = true
 serde.workspace = true
 tokio.workspace = true
+tokio-tungstenite.workspace = true
 tracing.workspace = true
 universaldb.workspace = true
 universalpubsub.workspace = true
diff --git a/engine/packages/guard/src/routing/api_public.rs b/engine/packages/guard/src/routing/api_public.rs
index 43415122da..143db070da 100644
--- a/engine/packages/guard/src/routing/api_public.rs
+++ b/engine/packages/guard/src/routing/api_public.rs
@@ -9,6 +9,7 @@ use hyper::{Request, Response};
 use rivet_guard_core::WebSocketHandle;
 use rivet_guard_core::proxy_service::{ResponseBody, RoutingOutput};
 use rivet_guard_core::{CustomServeTrait, request_context::RequestContext};
+use tokio_tungstenite::tungstenite::protocol::frame::CloseFrame;
 use tower::Service;
 
 struct ApiPublicService {
@@ -50,7 +51,8 @@ impl CustomServeTrait for ApiPublicService {
 		_headers: &hyper::HeaderMap,
 		_path: &str,
 		_request_context: &mut RequestContext,
-	) -> Result<()> {
+		_unique_request_id: Uuid,
+	) -> Result<Option<CloseFrame>> {
 		bail!("api-public does not support WebSocket connections")
 	}
 }
diff --git a/engine/packages/guard/src/routing/pegboard_gateway.rs b/engine/packages/guard/src/routing/pegboard_gateway.rs
index a7d6b7573d..c65bbc5c8a 100644
--- a/engine/packages/guard/src/routing/pegboard_gateway.rs
+++ b/engine/packages/guard/src/routing/pegboard_gateway.rs
@@ -10,9 +10,7 @@ use crate::{errors, shared_state::SharedState};
 
 const ACTOR_READY_TIMEOUT: Duration = Duration::from_secs(10);
 pub const X_RIVET_ACTOR: HeaderName = HeaderName::from_static("x-rivet-actor");
-pub const X_RIVET_AMESPACE: HeaderName = HeaderName::from_static("x-rivet-namespace");
 const WS_PROTOCOL_ACTOR: &str = "rivet_actor.";
-const WS_PROTOCOL_TOKEN: &str = "rivet_token.";
 
 /// Route requests to actor services using path-based routing
 #[tracing::instrument(skip_all)]
@@ -172,9 +170,8 @@ async fn route_request_inner(
 				res = stopped_sub.next() => {
 					res?;
 
-					// Attempt to rewake once
-					if wake_retries < 3 {
-						tracing::debug!(?actor_id, ?wake_retries, "actor stopped while we were waiting for it to beocme ready, attempting rewake");
+					if wake_retries < 16 {
+						tracing::debug!(?actor_id, ?wake_retries, "actor stopped while we were waiting for it to become ready, attempting rewake");
 						wake_retries += 1;
 
 						let res = ctx.signal(pegboard::workflows::actor::Wake {})
@@ -194,6 +191,9 @@ async fn route_request_inner(
 						} else {
 							res?;
 						}
+					} else {
+						tracing::warn!("actor retried waking 16 times, has not yet started");
+						return Err(rivet_guard_core::errors::ServiceUnavailable.build());
 					}
 				}
 				res = fail_sub.next() => {
diff --git a/engine/packages/namespace/src/keys/runner_config.rs b/engine/packages/namespace/src/keys/runner_config.rs
index 1d232805e6..7974a121ff 100644
--- a/engine/packages/namespace/src/keys/runner_config.rs
+++ b/engine/packages/namespace/src/keys/runner_config.rs
@@ -38,7 +38,7 @@ impl FormalKey for DataKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		rivet_data::versioned::NamespaceRunnerConfig::latest(value.into())
+		rivet_data::versioned::NamespaceRunnerConfig::wrap_latest(value.into())
 			.serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_RUNNER_CONFIG_VERSION)
 	}
 }
@@ -129,7 +129,7 @@ impl FormalKey for ByVariantKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		rivet_data::versioned::NamespaceRunnerConfig::latest(value.into())
+		rivet_data::versioned::NamespaceRunnerConfig::wrap_latest(value.into())
 			.serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_RUNNER_CONFIG_VERSION)
 	}
 }
diff --git a/engine/packages/pegboard-gateway/Cargo.toml b/engine/packages/pegboard-gateway/Cargo.toml
index ec5d7df480..693bf8de57 100644
--- a/engine/packages/pegboard-gateway/Cargo.toml
+++ b/engine/packages/pegboard-gateway/Cargo.toml
@@ -12,6 +12,7 @@ bytes.workspace = true
 futures-util.workspace = true
 gas.workspace = true
 http-body-util.workspace = true
+# TODO: Doesn't match workspace version
 hyper = "1.6"
 hyper-tungstenite.workspace = true
 pegboard.workspace = true
@@ -20,7 +21,9 @@ rivet-error.workspace = true
 rivet-guard-core.workspace = true
 rivet-runner-protocol.workspace = true
 rivet-util.workspace = true
+scc.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 thiserror.workspace = true
 tokio-tungstenite.workspace = true
 tokio.workspace = true
diff --git a/engine/packages/pegboard-gateway/src/lib.rs b/engine/packages/pegboard-gateway/src/lib.rs
index 230afa357c..5bbbd978b4 100644
--- a/engine/packages/pegboard-gateway/src/lib.rs
+++ b/engine/packages/pegboard-gateway/src/lib.rs
@@ -4,26 +4,47 @@ use bytes::Bytes;
 use futures_util::TryStreamExt;
 use gas::prelude::*;
 use http_body_util::{BodyExt, Full};
-use hyper::{Request, Response, StatusCode, header::HeaderName};
+use hyper::{Request, Response, StatusCode};
+use rivet_error::*;
 use rivet_guard_core::{
 	WebSocketHandle,
 	custom_serve::CustomServeTrait,
-	errors::{ServiceUnavailable, WebSocketServiceUnavailable},
+	errors::{
+		ServiceUnavailable, WebSocketServiceRetry, WebSocketServiceTimeout,
+		WebSocketServiceUnavailable,
+	},
 	proxy_service::ResponseBody,
 	request_context::RequestContext,
 };
 use rivet_runner_protocol as protocol;
 use rivet_util::serde::HashableMap;
 use std::time::Duration;
-use tokio_tungstenite::tungstenite::{Message, protocol::frame::coding::CloseCode};
+use tokio::sync::watch;
+use tokio_tungstenite::tungstenite::{
+	Message,
+	protocol::frame::{CloseFrame, coding::CloseCode},
+};
 
 use crate::shared_state::{SharedState, TunnelMessageData};
 
 pub mod shared_state;
 
 const TUNNEL_ACK_TIMEOUT: Duration = Duration::from_secs(2);
-const SEC_WEBSOCKET_PROTOCOL: HeaderName = HeaderName::from_static("sec-websocket-protocol");
-const WS_PROTOCOL_ACTOR: &str = "rivet_actor.";
+
+#[derive(RivetError, Serialize, Deserialize)]
+#[error(
+	"guard",
+	"websocket_pending_limit_reached",
+	"Reached limit on pending websocket messages, aborting connection."
+)]
+pub struct WebsocketPendingLimitReached;
+
+#[derive(Debug)]
+enum LifecycleResult {
+	ServerClose(protocol::ToServerWebSocketClose),
+	ClientClose(Option<CloseFrame>),
+	Aborted,
+}
 
 pub struct PegboardGateway {
 	shared_state: SharedState,
@@ -78,9 +99,10 @@ impl CustomServeTrait for PegboardGateway {
 			pegboard::pubsub_subjects::RunnerReceiverSubject::new(self.runner_id).to_string();
 
 		// Start listening for request responses
-		let (request_id, mut msg_rx) = self
+		let request_id = Uuid::new_v4().into_bytes();
+		let mut msg_rx = self
 			.shared_state
-			.start_in_flight_request(tunnel_subject)
+			.start_in_flight_request(tunnel_subject, request_id)
 			.await;
 
 		// Start request
@@ -111,6 +133,10 @@ impl CustomServeTrait for PegboardGateway {
 						) => {
 							return anyhow::Ok(response_start);
 						}
+						protocol::ToServerTunnelMessageKind::ToServerResponseAbort => {
+							tracing::warn!("request aborted");
+							return Err(ServiceUnavailable.build());
+						}
 						_ => {
 							tracing::warn!("received non-response message from pubsub");
 						}
@@ -122,7 +148,7 @@ impl CustomServeTrait for PegboardGateway {
 				}
 			}
 
-			tracing::warn!("received no message response");
+			tracing::warn!(request_id=?Uuid::from_bytes(request_id), "received no message response during request init");
 			Err(ServiceUnavailable.build())
 		};
 		let response_start = tokio::time::timeout(TUNNEL_ACK_TIMEOUT, fut)
@@ -157,7 +183,8 @@ impl CustomServeTrait for PegboardGateway {
 		headers: &hyper::HeaderMap,
 		_path: &str,
 		_request_context: &mut RequestContext,
-	) -> Result<()> {
+		unique_request_id: Uuid,
+	) -> Result<Option<CloseFrame>> {
 		// Use the actor ID from the gateway instance
 		let actor_id = self.actor_id.to_string();
 
@@ -174,9 +201,10 @@ impl CustomServeTrait for PegboardGateway {
 			pegboard::pubsub_subjects::RunnerReceiverSubject::new(self.runner_id).to_string();
 
 		// Start listening for WebSocket messages
-		let (request_id, mut msg_rx) = self
+		let request_id = unique_request_id.into_bytes();
+		let mut msg_rx = self
 			.shared_state
-			.start_in_flight_request(tunnel_subject.clone())
+			.start_in_flight_request(tunnel_subject.clone(), request_id)
 			.await;
 
 		// Send WebSocket open message
@@ -199,9 +227,9 @@ impl CustomServeTrait for PegboardGateway {
 			while let Some(msg) = msg_rx.recv().await {
 				match msg {
 					TunnelMessageData::Message(
-						protocol::ToServerTunnelMessageKind::ToServerWebSocketOpen,
+						protocol::ToServerTunnelMessageKind::ToServerWebSocketOpen(msg),
 					) => {
-						return anyhow::Ok(());
+						return anyhow::Ok(msg);
 					}
 					TunnelMessageData::Message(
 						protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(close),
@@ -221,10 +249,11 @@ impl CustomServeTrait for PegboardGateway {
 				}
 			}
 
-			tracing::warn!("received no message response");
+			tracing::warn!(request_id=?Uuid::from_bytes(request_id), "received no message response during ws init");
 			Err(WebSocketServiceUnavailable.build())
 		};
-		tokio::time::timeout(TUNNEL_ACK_TIMEOUT, fut)
+
+		let open_msg = tokio::time::timeout(TUNNEL_ACK_TIMEOUT, fut)
 			.await
 			.map_err(|_| {
 				tracing::warn!("timed out waiting for tunnel ack");
@@ -232,120 +261,198 @@ impl CustomServeTrait for PegboardGateway {
 				WebSocketServiceUnavailable.build()
 			})??;
 
-		// Accept the WebSocket
-		let mut ws_rx = client_ws.accept().await?;
+		self.shared_state
+			.toggle_hibernation(request_id, open_msg.can_hibernate)
+			.await?;
+
+		// Send reclaimed messages
+		self.shared_state
+			.resend_pending_websocket_messages(request_id, open_msg.last_msg_index)
+			.await?;
+
+		let ws_rx = client_ws.recv();
 
-		// Spawn task to forward messages from server to client
-		let mut server_to_client = tokio::spawn(
+		let (tunnel_to_ws_abort_tx, mut tunnel_to_ws_abort_rx) = watch::channel(());
+		let (ws_to_tunnel_abort_tx, mut ws_to_tunnel_abort_rx) = watch::channel(());
+
+		// Spawn task to forward messages from tunnel to ws
+		let shared_state = self.shared_state.clone();
+		let tunnel_to_ws = tokio::spawn(
 			async move {
-				while let Some(msg) = msg_rx.recv().await {
-					match msg {
-						TunnelMessageData::Message(
-							protocol::ToServerTunnelMessageKind::ToServerWebSocketMessage(ws_msg),
-						) => {
-							let msg = if ws_msg.binary {
-								Message::Binary(ws_msg.data.into())
+				loop {
+					tokio::select! {
+						res = msg_rx.recv() => {
+							if let Some(msg) = res {
+								match msg {
+									TunnelMessageData::Message(
+										protocol::ToServerTunnelMessageKind::ToServerWebSocketMessage(ws_msg),
+									) => {
+										let msg = if ws_msg.binary {
+											Message::Binary(ws_msg.data.into())
+										} else {
+											Message::Text(
+												String::from_utf8_lossy(&ws_msg.data).into_owned().into(),
+											)
+										};
+										client_ws.send(msg).await?;
+									}
+									TunnelMessageData::Message(
+										protocol::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack),
+									) => {
+										shared_state
+											.ack_pending_websocket_messages(request_id, ack.index)
+											.await?;
+									}
+									TunnelMessageData::Message(
+										protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(close),
+									) => {
+										tracing::debug!(?close, "server closed websocket");
+
+
+										if open_msg.can_hibernate && close.retry {
+											// Hibernatable socket closed with retry requested: surface the retry error
+											return Err(WebSocketServiceRetry.build());
+										} else {
+											return Ok(LifecycleResult::ServerClose(close));
+										}
+									}
+									TunnelMessageData::Timeout => {
+										tracing::warn!("websocket message timeout");
+										return Err(WebSocketServiceTimeout.build());
+									}
+									_ => {}
+								}
 							} else {
-								Message::Text(
-									String::from_utf8_lossy(&ws_msg.data).into_owned().into(),
-								)
-							};
-							client_ws.send(msg).await?;
-						}
-						TunnelMessageData::Message(
-							protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(close),
-						) => {
-							tracing::debug!(?close, "server closed websocket");
-							return Err(WebSocketServiceUnavailable.build());
+								tracing::debug!("tunnel sub closed");
+								return Err(WebSocketServiceRetry.build());
+							}
 						}
-						TunnelMessageData::Timeout => {
-							tracing::warn!("websocket message timeout");
-							return Err(WebSocketServiceUnavailable.build());
+						_ = tunnel_to_ws_abort_rx.changed() => {
+							tracing::debug!("task aborted");
+							return Ok(LifecycleResult::Aborted);
 						}
-						_ => {}
 					}
 				}
-
-				tracing::debug!("sub closed");
-
-				Err(WebSocketServiceUnavailable.build())
 			}
-			.instrument(tracing::info_span!("server_to_client_task")),
+			.instrument(tracing::info_span!("tunnel_to_ws_task")),
 		);
 
-		// Spawn task to forward messages from client to server
+		// Spawn task to forward messages from ws to tunnel
 		let shared_state_clone = self.shared_state.clone();
-		let mut client_to_server = tokio::spawn(
+		let ws_to_tunnel = tokio::spawn(
 			async move {
-				while let Some(msg) = ws_rx.try_next().await? {
-					match msg {
-						Message::Binary(data) => {
-							let ws_message =
-								protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage(
-									protocol::ToClientWebSocketMessage {
-										data: data.into(),
-										binary: true,
-									},
-								);
-							shared_state_clone
-								.send_message(request_id, ws_message)
-								.await?;
-						}
-						Message::Text(text) => {
-							let ws_message =
-								protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage(
-									protocol::ToClientWebSocketMessage {
-										data: text.as_bytes().to_vec(),
-										binary: false,
-									},
-								);
-							shared_state_clone
-								.send_message(request_id, ws_message)
-								.await?;
+				let mut ws_rx = ws_rx.lock().await;
+
+				loop {
+					tokio::select! {
+						res = ws_rx.try_next() => {
+							if let Some(msg) = res? {
+								match msg {
+									Message::Binary(data) => {
+										let ws_message =
+											protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage(
+												protocol::ToClientWebSocketMessage {
+													// NOTE: This gets set in shared_state.rs
+													index: 0,
+													data: data.into(),
+													binary: true,
+												},
+											);
+										shared_state_clone
+											.send_message(request_id, ws_message)
+											.await?;
+									}
+									Message::Text(text) => {
+										let ws_message =
+											protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage(
+												protocol::ToClientWebSocketMessage {
+													// NOTE: This gets set in shared_state.rs
+													index: 0,
+													data: text.as_bytes().to_vec(),
+													binary: false,
+												},
+											);
+										shared_state_clone
+											.send_message(request_id, ws_message)
+											.await?;
+									}
+									Message::Close(close) => {
+										return Ok(LifecycleResult::ClientClose(close));
+									}
+									_ => {}
+								}
+							} else {
+								tracing::debug!("websocket stream closed");
+								return Ok(LifecycleResult::ClientClose(None));
+							}
 						}
-						Message::Close(_) => {
-							return Ok(());
+						_ = ws_to_tunnel_abort_rx.changed() => {
+							tracing::debug!("task aborted");
+							return Ok(LifecycleResult::Aborted);
 						}
-						_ => {}
-					}
+					};
 				}
-
-				tracing::debug!("websocket stream closed");
-
-				Ok(())
 			}
-			.instrument(tracing::info_span!("client_to_server_task")),
+			.instrument(tracing::info_span!("ws_to_tunnel_task")),
 		);
 
-		// Wait for either task to complete
-		let lifecycle_res = tokio::select! {
-			res = &mut server_to_client => {
-				let res = res?;
-				tracing::debug!(?res, "server to client task completed");
+		// Wait for both tasks to complete
+		let (tunnel_to_ws_res, ws_to_tunnel_res) = tokio::join!(
+			async {
+				let res = tunnel_to_ws.await?;
+
+				// Abort other if not aborted
+				if !matches!(res, Ok(LifecycleResult::Aborted)) {
+					tracing::debug!(?res, "tunnel to ws task completed, aborting counterpart");
+
+					drop(ws_to_tunnel_abort_tx);
+				} else {
+					tracing::debug!(?res, "tunnel to ws task completed");
+				}
+
 				res
-			}
-			res = &mut client_to_server => {
-				let res = res?;
-				tracing::debug!(?res, "client to server task completed");
+			},
+			async {
+				let res = ws_to_tunnel.await?;
+
+				// Abort other if not aborted
+				if !matches!(res, Ok(LifecycleResult::Aborted)) {
+					tracing::debug!(?res, "ws to tunnel task completed, aborting counterpart");
+
+					drop(tunnel_to_ws_abort_tx);
+				} else {
+					tracing::debug!(?res, "ws to tunnel task completed");
+				}
+
 				res
 			}
-		};
-
-		// Abort remaining tasks
-		server_to_client.abort();
-		client_to_server.abort();
+		);
 
-		let (close_code, close_reason) = if lifecycle_res.is_ok() {
-			(CloseCode::Normal.into(), None)
-		} else {
-			(CloseCode::Error.into(), Some("ws.downstream_closed".into()))
+		// Determine single result from both tasks
+		let mut lifecycle_res = match (tunnel_to_ws_res, ws_to_tunnel_res) {
+			// Prefer error
+			(_, Err(err)) => Err(err),
+			(Err(err), _) => Err(err),
+			// Prefer non aborted result if both succeed
+			(Ok(res), Ok(LifecycleResult::Aborted)) => Ok(res),
+			(Ok(LifecycleResult::Aborted), Ok(res)) => Ok(res),
+			// Prefer tunnel to ws if both succeed (unlikely case)
+			(res, _) => res,
 		};
 
 		// Send WebSocket close message to runner
+		let (close_code, close_reason) = match &mut lifecycle_res {
+			// Taking here because it won't be used again
+			Ok(LifecycleResult::ClientClose(Some(close))) => {
+				(close.code, Some(std::mem::take(&mut close.reason)))
+			}
+			Ok(_) => (CloseCode::Normal.into(), None),
+			Err(_) => (CloseCode::Error.into(), Some("ws.downstream_closed".into())),
+		};
 		let close_message = protocol::ToClientTunnelMessageKind::ToClientWebSocketClose(
 			protocol::ToClientWebSocketClose {
-				code: Some(close_code),
-				reason: close_reason,
+				code: Some(close_code.into()),
+				reason: close_reason.map(|x| x.as_str().to_string()),
 			},
 		);
 
@@ -357,6 +464,20 @@ impl CustomServeTrait for PegboardGateway {
 			tracing::error!(?err, "error sending close message");
 		}
 
-		lifecycle_res
+		// Send WebSocket close message to client
+		match lifecycle_res {
+			Ok(LifecycleResult::ServerClose(close)) => {
+				if let Some(code) = close.code {
+					Ok(Some(CloseFrame {
+						code: code.into(),
+						reason: close.reason.unwrap_or_default().into(),
+					}))
+				} else {
+					Ok(None)
+				}
+			}
+			Ok(_) => Ok(None),
+			Err(err) => Err(err),
+		}
 	}
 }
diff --git a/engine/packages/pegboard-gateway/src/shared_state.rs b/engine/packages/pegboard-gateway/src/shared_state.rs
index 7d93e4e93d..08abdf8aca 100644
--- a/engine/packages/pegboard-gateway/src/shared_state.rs
+++ b/engine/packages/pegboard-gateway/src/shared_state.rs
@@ -1,18 +1,26 @@
 use anyhow::Result;
 use gas::prelude::*;
 use rivet_runner_protocol::{self as protocol, MessageId, PROTOCOL_VERSION, RequestId, versioned};
+use scc::{HashMap, hash_map::Entry};
 use std::{
-	collections::HashMap,
 	ops::Deref,
 	sync::Arc,
 	time::{Duration, Instant},
 };
-use tokio::sync::{Mutex, mpsc};
+use tokio::sync::mpsc;
 use universalpubsub::{NextOutput, PubSub, PublishOpts, Subscriber};
 use vbare::OwnedVersionedData;
 
-const GC_INTERVAL: Duration = Duration::from_secs(60);
-const MESSAGE_ACK_TIMEOUT: Duration = Duration::from_secs(5);
+use crate::WebsocketPendingLimitReached;
+
+const GC_INTERVAL: Duration = Duration::from_secs(15);
+const MESSAGE_ACK_TIMEOUT: Duration = Duration::from_secs(30);
+const MAX_PENDING_MSGS_SIZE_PER_REQ: u64 = util::size::mebibytes(1);
+
+pub enum TunnelMessageData {
+	Message(protocol::ToServerTunnelMessageKind),
+	Timeout,
+}
 
 struct InFlightRequest {
 	/// UPS subject to send messages to for this request.
@@ -21,23 +29,30 @@ struct InFlightRequest {
 	msg_tx: mpsc::Sender<TunnelMessageData>,
 	/// True once first message for this request has been sent (so runner learned reply_to).
 	opened: bool,
+	pending_msgs: Vec<PendingMessage>,
+	hibernation_state: Option<HibernationState>,
 }
 
-struct PendingMessage {
-	request_id: RequestId,
+pub struct PendingMessage {
+	message_id: MessageId,
 	send_instant: Instant,
 }
 
-pub enum TunnelMessageData {
-	Message(protocol::ToServerTunnelMessageKind),
-	Timeout,
+struct HibernationState {
+	total_pending_ws_msgs_size: u64,
+	last_ws_msg_index: u16,
+	pending_ws_msgs: Vec<PendingWebsocketMessage>,
+}
+
+pub struct PendingWebsocketMessage {
+	payload: Vec<u8>,
+	send_instant: Instant,
 }
 
 pub struct SharedStateInner {
 	ups: PubSub,
 	receiver_subject: String,
-	requests_in_flight: Mutex<HashMap<RequestId, InFlightRequest>>,
-	pending_messages: Mutex<HashMap<MessageId, PendingMessage>>,
+	in_flight_requests: HashMap<RequestId, InFlightRequest>,
 }
 
 #[derive(Clone)]
@@ -52,8 +67,7 @@ impl SharedState {
 		Self(Arc::new(SharedStateInner {
 			ups,
 			receiver_subject,
-			requests_in_flight: Mutex::new(HashMap::new()),
-			pending_messages: Mutex::new(HashMap::new()),
+			in_flight_requests: HashMap::new(),
 		}))
 	}
 
@@ -69,44 +83,67 @@ impl SharedState {
 		Ok(())
 	}
 
+	pub async fn start_in_flight_request(
+		&self,
+		receiver_subject: String,
+		request_id: RequestId,
+	) -> mpsc::Receiver<TunnelMessageData> {
+		let (msg_tx, msg_rx) = mpsc::channel(128);
+
+		match self.in_flight_requests.entry_async(request_id).await {
+			Entry::Vacant(entry) => {
+				entry.insert_entry(InFlightRequest {
+					receiver_subject,
+					msg_tx,
+					opened: false,
+					pending_msgs: Vec::new(),
+					hibernation_state: None,
+				});
+			}
+			Entry::Occupied(mut entry) => {
+				entry.receiver_subject = receiver_subject;
+				entry.msg_tx = msg_tx;
+				entry.opened = false;
+				entry.pending_msgs.clear();
+			}
+		}
+
+		msg_rx
+	}
+
 	pub async fn send_message(
 		&self,
 		request_id: RequestId,
-		message_kind: protocol::ToClientTunnelMessageKind,
+		mut message_kind: protocol::ToClientTunnelMessageKind,
 	) -> Result<()> {
 		let message_id = Uuid::new_v4().as_bytes().clone();
 
-		// Get subject and whether this is the first message for this request
-		let (tunnel_receiver_subject, include_reply_to) = {
-			let mut requests_in_flight = self.requests_in_flight.lock().await;
-			if let Some(req) = requests_in_flight.get_mut(&request_id) {
-				let receiver_subject = req.receiver_subject.clone();
-				let include_reply_to = !req.opened;
-				if include_reply_to {
-					// Mark as opened so subsequent messages skip reply_to
-					req.opened = true;
-				}
-				(receiver_subject, include_reply_to)
-			} else {
-				bail!("request not in flight")
-			}
-		};
+		let mut req = self
+			.in_flight_requests
+			.get_async(&request_id)
+			.await
+			.context("request not in flight")?;
 
-		// Save pending message
-		{
-			let mut pending_messages = self.pending_messages.lock().await;
-			pending_messages.insert(
-				message_id,
-				PendingMessage {
-					request_id,
-					send_instant: Instant::now(),
-				},
-			);
+		let include_reply_to = !req.opened;
+		if include_reply_to {
+			// Mark as opened so subsequent messages skip reply_to
+			req.opened = true;
 		}
 
-		// Send message
-		let message = protocol::ToClient::ToClientTunnelMessage(protocol::ToClientTunnelMessage {
-			request_id,
+		let ws_msg_index =
+			if let (Some(hs), protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg)) =
+				(&req.hibernation_state, &mut message_kind)
+			{
+				// TODO: This ends up skipping 0 as an index when initiated but whatever
+				msg.index = hs.last_ws_msg_index.wrapping_add(1);
+
+				Some(msg.index)
+			} else {
+				None
+			};
+
+		let payload = protocol::ToClientTunnelMessage {
+			request_id: request_id.clone(),
 			message_id,
 			// Only send reply to subject on the first message for this request. This reduces
 			// overhead of subsequent messages.
@@ -116,12 +153,41 @@ impl SharedState {
 				None
 			},
 			message_kind,
+		};
+
+		let now = Instant::now();
+		req.pending_msgs.push(PendingMessage {
+			message_id,
+			send_instant: now,
 		});
-		let message_serialized = versioned::ToClient::latest(message)
+
+		// Send message
+		let message = protocol::ToClient::ToClientTunnelMessage(payload);
+		let message_serialized = versioned::ToClient::wrap_latest(message)
 			.serialize_with_embedded_version(PROTOCOL_VERSION)?;
+
+		if let (Some(hs), Some(ws_msg_index)) = (&mut req.hibernation_state, ws_msg_index) {
+			hs.total_pending_ws_msgs_size += message_serialized.len() as u64;
+
+			if hs.total_pending_ws_msgs_size > MAX_PENDING_MSGS_SIZE_PER_REQ
+				|| hs.pending_ws_msgs.len() >= u16::MAX as usize
+			{
+				return Err(WebsocketPendingLimitReached {}.build());
+			}
+
+			hs.last_ws_msg_index = ws_msg_index;
+
+			let pending_ws_msg = PendingWebsocketMessage {
+				payload: message_serialized.clone(),
+				send_instant: now,
+			};
+
+			hs.pending_ws_msgs.push(pending_ws_msg);
+		}
+
 		self.ups
 			.publish(
-				&tunnel_receiver_subject,
+				&req.receiver_subject,
 				&message_serialized,
 				PublishOpts::one(),
 			)
@@ -130,23 +196,6 @@ impl SharedState {
 		Ok(())
 	}
 
-	pub async fn start_in_flight_request(
-		&self,
-		receiver_subject: String,
-	) -> (RequestId, mpsc::Receiver<TunnelMessageData>) {
-		let id = Uuid::new_v4().into_bytes();
-		let (msg_tx, msg_rx) = mpsc::channel(128);
-		self.requests_in_flight.lock().await.insert(
-			id,
-			InFlightRequest {
-				receiver_subject,
-				msg_tx,
-				opened: false,
-			},
-		);
-		(id, msg_rx)
-	}
-
 	async fn receiver(&self, mut sub: Subscriber) {
 		while let Ok(NextOutput::Message(msg)) = sub.next().await {
 			tracing::trace!(
@@ -157,31 +206,37 @@ impl SharedState {
 			match versioned::ToGateway::deserialize_with_embedded_version(&msg.payload) {
 				Ok(protocol::ToGateway { message: msg }) => {
 					tracing::debug!(
-						?msg.request_id,
-						?msg.message_id,
+						request_id=?Uuid::from_bytes(msg.request_id),
+						message_id=?Uuid::from_bytes(msg.message_id),
 						"successfully deserialized message"
 					);
+
+					let Some(mut in_flight) =
+						self.in_flight_requests.get_async(&msg.request_id).await
+					else {
+						tracing::debug!(
+							request_id=?Uuid::from_bytes(msg.request_id),
+							"in flight has already been disconnected"
+						);
+						continue;
+					};
+
 					if let protocol::ToServerTunnelMessageKind::TunnelAck = &msg.message_kind {
-						// Handle ack message
+						let prev_len = in_flight.pending_msgs.len();
+
+						in_flight
+							.pending_msgs
+							.retain(|m| m.message_id != msg.message_id);
 
-						let mut pending_messages = self.pending_messages.lock().await;
-						if pending_messages.remove(&msg.message_id).is_none() {
+						if prev_len == in_flight.pending_msgs.len() {
 							tracing::warn!(
 								"pending message does not exist or ack received after message body"
-							);
+							)
 						}
 					} else {
 						// Send message to the request handler to emulate the real network action
-						let requests_in_flight = self.requests_in_flight.lock().await;
-						let Some(in_flight) = requests_in_flight.get(&msg.request_id) else {
-							tracing::debug!(
-								?msg.request_id,
-								"in flight has already been disconnected"
-							);
-							continue;
-						};
 						tracing::debug!(
-							?msg.request_id,
+							request_id=?Uuid::from_bytes(msg.request_id),
 							"forwarding message to request handler"
 						);
 						let _ = in_flight
@@ -200,15 +255,16 @@ impl SharedState {
 								message_kind: protocol::ToClientTunnelMessageKind::TunnelAck,
 							},
 						);
-						let ack_message_serialized = match versioned::ToClient::latest(ack_message)
-							.serialize_with_embedded_version(PROTOCOL_VERSION)
-						{
-							Ok(x) => x,
-							Err(err) => {
-								tracing::error!(?err, "failed to serialize ack");
-								continue;
-							}
-						};
+						let ack_message_serialized =
+							match versioned::ToClient::wrap_latest(ack_message)
+								.serialize_with_embedded_version(PROTOCOL_VERSION)
+							{
+								Ok(x) => x,
+								Err(err) => {
+									tracing::error!(?err, "failed to serialize ack");
+									continue;
+								}
+							};
 						tokio::spawn(async move {
 							if let Err(err) = ups_clone
 								.publish(
@@ -230,46 +286,158 @@ impl SharedState {
 		}
 	}
 
+	pub async fn toggle_hibernation(&self, request_id: RequestId, enable: bool) -> Result<()> {
+		let mut req = self
+			.in_flight_requests
+			.get_async(&request_id)
+			.await
+			.context("request not in flight")?;
+
+		match (req.hibernation_state.is_some(), enable) {
+			(true, true) => {}
+			(true, false) => req.hibernation_state = None,
+			(false, true) => {
+				req.hibernation_state = Some(HibernationState {
+					total_pending_ws_msgs_size: 0,
+					last_ws_msg_index: 0,
+					pending_ws_msgs: Vec::new(),
+				});
+			}
+			(false, false) => {}
+		}
+
+		Ok(())
+	}
+
+	pub async fn resend_pending_websocket_messages(
+		&self,
+		request_id: RequestId,
+		last_msg_index: i64,
+	) -> Result<()> {
+		let Some(mut req) = self.in_flight_requests.get_async(&request_id).await else {
+			bail!("request not in flight");
+		};
+
+		let receiver_subject = req.receiver_subject.clone();
+
+		if let Some(hs) = &mut req.hibernation_state {
+			if !hs.pending_ws_msgs.is_empty() {
+				tracing::debug!(request_id=?Uuid::from_bytes(request_id.clone()), len=?hs.pending_ws_msgs.len(), ?last_msg_index, "resending pending messages");
+
+				let len = hs.pending_ws_msgs.len().try_into()?;
+
+				for (iter_index, pending_msg) in hs.pending_ws_msgs.iter().enumerate() {
+					let msg_index = hs
+						.last_ws_msg_index
+						.wrapping_sub(len)
+						.wrapping_add(1)
+						.wrapping_add(iter_index.try_into()?);
+
+					if last_msg_index < 0 || wrapping_gt(msg_index, last_msg_index.try_into()?) {
+						self.ups
+							.publish(&receiver_subject, &pending_msg.payload, PublishOpts::one())
+							.await?;
+					}
+				}
+
+				// Perform ack: drop messages at or before last_msg_index and release their size budget
+				if last_msg_index >= 0 {
+					let last_msg_index = last_msg_index.try_into()?;
+					let mut iter_index = 0;
+
+					hs.pending_ws_msgs.retain(|acked_msg| {
+						let msg_index = hs
+							.last_ws_msg_index
+							.wrapping_sub(len)
+							.wrapping_add(1)
+							.wrapping_add(iter_index);
+						let keep = wrapping_gt(msg_index, last_msg_index);
+						let freed = if keep { 0 } else { acked_msg.payload.len() as u64 };
+						iter_index += 1;
+						hs.total_pending_ws_msgs_size = hs.total_pending_ws_msgs_size.saturating_sub(freed);
+						keep
+					});
+
+					if hs.pending_ws_msgs.is_empty() {
+						hs.last_ws_msg_index = last_msg_index;
+					}
+				}
+			}
+		}
+
+		Ok(())
+	}
+
+	pub async fn ack_pending_websocket_messages(
+		&self,
+		request_id: RequestId,
+		ack_index: u16,
+	) -> Result<()> {
+		let Some(mut req) = self.in_flight_requests.get_async(&request_id).await else {
+			bail!("request not in flight");
+		};
+
+		let Some(hs) = &mut req.hibernation_state else {
+			tracing::warn!("cannot ack ws messages, hibernation is not enabled");
+			return Ok(());
+		};
+
+		let len = hs.pending_ws_msgs.len().try_into()?;
+		let mut iter_index = 0u16;
+		hs.pending_ws_msgs.retain(|acked_msg| {
+			let msg_index = hs
+				.last_ws_msg_index
+				.wrapping_sub(len)
+				.wrapping_add(1)
+				.wrapping_add(iter_index);
+			let keep = wrapping_gt(msg_index, ack_index);
+			// Acked messages are no longer pending: give their bytes back to the size budget
+			let freed = if keep { 0 } else { acked_msg.payload.len() as u64 };
+			hs.total_pending_ws_msgs_size = hs.total_pending_ws_msgs_size.saturating_sub(freed);
+			iter_index += 1;
+			keep
+		});
+
+		Ok(())
+	}
+
 	async fn gc(&self) {
 		let mut interval = tokio::time::interval(GC_INTERVAL);
+		interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+		// Each tick drops requests whose receiver is closed or whose oldest unacked message expired
 		loop {
 			interval.tick().await;
 
 			let now = Instant::now();
 
-			// Purge unacked messages
-			{
-				let mut pending_messages = self.pending_messages.lock().await;
-				let mut removed_req_ids = Vec::new();
-				pending_messages.retain(|_k, v| {
-					if now.duration_since(v.send_instant) > MESSAGE_ACK_TIMEOUT {
-						// Expired
-						removed_req_ids.push(v.request_id.clone());
-						false
-					} else {
-						true
+			self.in_flight_requests
+				.retain_async(|_, req| {
+					if req.msg_tx.is_closed() {
+						return false;
 					}
-				});
 
-				// Close in-flight messages
-				let requests_in_flight = self.requests_in_flight.lock().await;
-				for req_id in removed_req_ids {
-					if let Some(x) = requests_in_flight.get(&req_id) {
-						let _ = x.msg_tx.send(TunnelMessageData::Timeout);
-					} else {
-						tracing::warn!(
-							?req_id,
-							"message expired for in flight that does not exist"
-						);
+					let mut keep = true;
+					// Keep the request only while its oldest unacked message is within the ack timeout
+					if let Some(earliest_pending_msg) = req.pending_msgs.first() {
+						keep = now.duration_since(earliest_pending_msg.send_instant)
+							<= MESSAGE_ACK_TIMEOUT;
 					}
-				}
-			}
 
-			// Purge no longer in flight
-			{
-				let mut requests_in_flight = self.requests_in_flight.lock().await;
-				requests_in_flight.retain(|_k, v| !v.msg_tx.is_closed());
-			}
+					if let Some(hs) = &req.hibernation_state {
+						if let (true, Some(earliest_pending_ws_msg)) =
+							(keep, hs.pending_ws_msgs.first())
+						{
+							keep = now.duration_since(earliest_pending_ws_msg.send_instant)
+								<= MESSAGE_ACK_TIMEOUT;
+						}
+					}
+					// This closure is synchronous: an un-awaited `send` future never runs, so use try_send
+					if !keep {
+						let _ = req.msg_tx.try_send(TunnelMessageData::Timeout);
+					}
+
+					keep
+				})
+				.await;
 		}
 	}
 }
@@ -281,3 +449,11 @@ impl Deref for SharedState {
 		&self.0
 	}
 }
+
+fn wrapping_gt(a: u16, b: u16) -> bool {
+	(1..u16::MAX / 2).contains(&a.wrapping_sub(b)) // true when `a` leads `b` by 1..u16::MAX / 2 (exclusive), modulo 2^16
+}
+
+// fn wrapping_lt(a: u16, b: u16) -> bool {
+//     b.wrapping_sub(a) < u16::MAX / 2
+// }
diff --git a/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs b/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs
index 99b9ec6e1c..8b72bb69ff 100644
--- a/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs
+++ b/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs
@@ -8,14 +8,22 @@ use pegboard_actor_kv as kv;
 use rivet_guard_core::websocket_handle::WebSocketReceiver;
 use rivet_runner_protocol::{self as protocol, PROTOCOL_VERSION, versioned};
 use std::sync::{Arc, atomic::Ordering};
+use tokio::sync::Mutex;
 use universalpubsub::PublishOpts;
 use vbare::OwnedVersionedData;
 
 use crate::conn::Conn;
 
 #[tracing::instrument(skip_all, fields(runner_id=?conn.runner_id, workflow_id=?conn.workflow_id, protocol_version=%conn.protocol_version))]
-pub async fn task(ctx: StandaloneCtx, conn: Arc<Conn>, mut ws_rx: WebSocketReceiver) -> Result<()> {
+pub async fn task(
+	ctx: StandaloneCtx,
+	conn: Arc<Conn>,
+	ws_rx: Arc<Mutex<WebSocketReceiver>>,
+) -> Result<()> {
 	tracing::debug!("starting WebSocket to pubsub forwarding task");
+
+	let mut ws_rx = ws_rx.lock().await;
+
 	while let Some(msg) = ws_rx.try_next().await? {
 		match msg {
 			WsMessage::Binary(data) => {
@@ -27,7 +35,7 @@ pub async fn task(ctx: StandaloneCtx, conn: Arc<Conn>, mut ws_rx: WebSocketRecei
 				// Parse message
 				let msg =
 					match versioned::ToServer::deserialize_version(&data, conn.protocol_version)
-						.and_then(|x| x.into_latest())
+						.and_then(|x| x.unwrap_latest())
 					{
 						Ok(x) => x,
 						Err(err) => {
@@ -87,7 +95,7 @@ async fn handle_message(
 			let actor_id = match Id::parse(&req.actor_id) {
 				Ok(actor_id) => actor_id,
 				Err(err) => {
-					let res_msg = versioned::ToClient::latest(
+					let res_msg = versioned::ToClient::wrap_latest(
 						protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 							request_id: req.request_id,
 							data: protocol::KvResponseData::KvErrorResponse(
@@ -124,16 +132,16 @@ async fn handle_message(
 
 			// Verify actor belongs to this runner
 			if !actor_belongs {
-				let res_msg = versioned::ToClient::latest(protocol::ToClient::ToClientKvResponse(
-					protocol::ToClientKvResponse {
+				let res_msg = versioned::ToClient::wrap_latest(
+					protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 						request_id: req.request_id,
 						data: protocol::KvResponseData::KvErrorResponse(
 							protocol::KvErrorResponse {
 								message: "given actor does not belong to runner".to_string(),
 							},
 						),
-					},
-				));
+					}),
+				);
 
 				let res_msg_serialized = res_msg
 					.serialize(conn.protocol_version)
@@ -152,7 +160,7 @@ async fn handle_message(
 				protocol::KvRequestData::KvGetRequest(body) => {
 					let res = kv::get(&*ctx.udb()?, actor_id, body.keys).await;
 
-					let res_msg = versioned::ToClient::latest(
+					let res_msg = versioned::ToClient::wrap_latest(
 						protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 							request_id: req.request_id,
 							data: match res {
@@ -196,7 +204,7 @@ async fn handle_message(
 					)
 					.await;
 
-					let res_msg = versioned::ToClient::latest(
+					let res_msg = versioned::ToClient::wrap_latest(
 						protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 							request_id: req.request_id,
 							data: match res {
@@ -230,7 +238,7 @@ async fn handle_message(
 				protocol::KvRequestData::KvPutRequest(body) => {
 					let res = kv::put(&*ctx.udb()?, actor_id, body.keys, body.values).await;
 
-					let res_msg = versioned::ToClient::latest(
+					let res_msg = versioned::ToClient::wrap_latest(
 						protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 							request_id: req.request_id,
 							data: match res {
@@ -258,7 +266,7 @@ async fn handle_message(
 				protocol::KvRequestData::KvDeleteRequest(body) => {
 					let res = kv::delete(&*ctx.udb()?, actor_id, body.keys).await;
 
-					let res_msg = versioned::ToClient::latest(
+					let res_msg = versioned::ToClient::wrap_latest(
 						protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 							request_id: req.request_id,
 							data: match res {
@@ -284,7 +292,7 @@ async fn handle_message(
 				protocol::KvRequestData::KvDropRequest => {
 					let res = kv::delete_all(&*ctx.udb()?, actor_id).await;
 
-					let res_msg = versioned::ToClient::latest(
+					let res_msg = versioned::ToClient::wrap_latest(
 						protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse {
 							request_id: req.request_id,
 							data: match res {
@@ -360,7 +368,7 @@ async fn handle_tunnel_message(
 	}
 
 	// Publish message to UPS
-	let msg_serialized = versioned::ToGateway::latest(protocol::ToGateway { message: msg })
+	let msg_serialized = versioned::ToGateway::wrap_latest(protocol::ToGateway { message: msg })
 		.serialize_with_embedded_version(PROTOCOL_VERSION)
 		.context("failed to serialize tunnel message for gateway")?;
 	ctx.ups()
diff --git a/engine/packages/pegboard-runner/src/conn.rs b/engine/packages/pegboard-runner/src/conn.rs
index 7649717cd5..198bf0fb53 100644
--- a/engine/packages/pegboard-runner/src/conn.rs
+++ b/engine/packages/pegboard-runner/src/conn.rs
@@ -4,7 +4,7 @@ use gas::prelude::Id;
 use gas::prelude::*;
 use hyper_tungstenite::tungstenite::Message;
 use pegboard::ops::runner::update_alloc_idx::{Action, RunnerEligibility};
-use rivet_guard_core::{WebSocketHandle, websocket_handle::WebSocketReceiver};
+use rivet_guard_core::WebSocketHandle;
 use rivet_runner_protocol as protocol;
 use rivet_runner_protocol::*;
 use std::{
@@ -20,6 +20,7 @@ use crate::{errors::WsError, utils::UrlData};
 pub struct TunnelActiveRequest {
 	/// Subject to send replies to.
 	pub gateway_reply_to: String,
+	pub is_ws: bool,
 }
 
 pub struct Conn {
@@ -42,7 +43,6 @@ pub struct Conn {
 pub async fn init_conn(
 	ctx: &StandaloneCtx,
 	ws_handle: WebSocketHandle,
-	ws_rx: &mut WebSocketReceiver,
 	UrlData {
 		protocol_version,
 		namespace,
@@ -59,6 +59,9 @@ pub async fn init_conn(
 
 	tracing::debug!("new runner connection");
 
+	let ws_rx = ws_handle.recv();
+	let mut ws_rx = ws_rx.lock().await;
+
 	// Receive init packet
 	let (runner_id, workflow_id) = if let Some(msg) =
 		tokio::time::timeout(Duration::from_secs(5), ws_rx.next())
diff --git a/engine/packages/pegboard-runner/src/lib.rs b/engine/packages/pegboard-runner/src/lib.rs
index 95b4a1591d..0864e32a50 100644
--- a/engine/packages/pegboard-runner/src/lib.rs
+++ b/engine/packages/pegboard-runner/src/lib.rs
@@ -11,7 +11,7 @@ use rivet_guard_core::{
 };
 use rivet_runner_protocol as protocol;
 use std::time::Duration;
-use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode;
+use tokio_tungstenite::tungstenite::protocol::frame::{CloseFrame, coding::CloseCode};
 use universalpubsub::PublishOpts;
 use vbare::OwnedVersionedData;
 
@@ -61,7 +61,8 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe {
 		_headers: &hyper::HeaderMap,
 		path: &str,
 		_request_context: &mut RequestContext,
-	) -> Result<()> {
+		_unique_request_id: Uuid,
+	) -> Result<Option<CloseFrame>> {
 		// Get UPS
 		let ups = self.ctx.ups().context("failed to get UPS instance")?;
 
@@ -73,14 +74,8 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe {
 
 		tracing::debug!(?path, "tunnel ws connection established");
 
-		// Accept WS
-		let mut ws_rx = ws_handle
-			.accept()
-			.await
-			.context("failed to accept WebSocket connection")?;
-
 		// Create connection
-		let conn = conn::init_conn(&self.ctx, ws_handle.clone(), &mut ws_rx, url_data)
+		let conn = conn::init_conn(&self.ctx, ws_handle.clone(), url_data)
 			.await
 			.context("failed to initialize runner connection")?;
 
@@ -101,7 +96,7 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe {
 		let mut client_to_pubsub = tokio::spawn(client_to_pubsub_task::task(
 			self.ctx.clone(),
 			conn.clone(),
-			ws_rx,
+			ws_handle.recv(),
 		));
 
 		// Update pings
@@ -152,24 +147,30 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe {
 		// Send WebSocket close messages to all remaining active requests
 		let active_requests = conn.tunnel_active_requests.lock().await;
 		for (request_id, req) in &*active_requests {
-			let (close_code, close_reason) = if lifecycle_res.is_ok() {
-				(CloseCode::Normal.into(), None)
+			let close_msg_kind = if req.is_ws {
+				let (close_code, close_reason) = if lifecycle_res.is_ok() {
+					(CloseCode::Normal.into(), None)
+				} else {
+					(CloseCode::Error.into(), Some("ws.upstream_closed".into()))
+				};
+
+				protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(
+					protocol::ToServerWebSocketClose {
+						code: Some(close_code),
+						reason: close_reason,
+						retry: true,
+					},
+				)
 			} else {
-				(CloseCode::Error.into(), Some("ws.upstream_closed".into()))
+				protocol::ToServerTunnelMessageKind::ToServerResponseAbort
 			};
-
 			let close_message = protocol::ToServerTunnelMessage {
 				request_id: request_id.clone(),
 				message_id: Uuid::new_v4().into_bytes(),
-				message_kind: protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(
-					protocol::ToServerWebSocketClose {
-						code: Some(close_code),
-						reason: close_reason,
-					},
-				),
+				message_kind: close_msg_kind,
 			};
 
-			let msg_serialized = protocol::versioned::ToGateway::latest(protocol::ToGateway {
+			let msg_serialized = protocol::versioned::ToGateway::wrap_latest(protocol::ToGateway {
 				message: close_message.clone(),
 			})
 			.serialize_with_embedded_version(protocol::PROTOCOL_VERSION)
@@ -193,6 +194,6 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe {
 		}
 
 		// This will determine the close frame sent back to the runner websocket
-		lifecycle_res
+		lifecycle_res.map(|_| None)
 	}
 }
diff --git a/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs b/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs
index 9dc4179a2a..9d249b0659 100644
--- a/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs
+++ b/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs
@@ -37,25 +37,50 @@ pub async fn task(conn: Arc<Conn>, mut sub: Subscriber) -> Result<()> {
 			protocol::ToClient::ToClientClose => return Err(errors::WsError::Eviction.build()),
 			// Handle tunnel messages
 			protocol::ToClient::ToClientTunnelMessage(tunnel_msg) => {
-				// Save active request
-				//
-				// This will remove gateway_reply_to from the message since it does not need to be sent to the
-				// client
-				if let Some(reply_to) = tunnel_msg.gateway_reply_to.take() {
-					tracing::debug!(?tunnel_msg.request_id, ?reply_to, "creating active request");
-					let mut active_requests = conn.tunnel_active_requests.lock().await;
-					active_requests.insert(
-						tunnel_msg.request_id,
-						TunnelActiveRequest {
-							gateway_reply_to: reply_to,
-						},
-					);
-				}
-
 				match tunnel_msg.message_kind {
+					protocol::ToClientTunnelMessageKind::ToClientRequestStart(_) => {
+						// Save active request
+						//
+						// This will remove gateway_reply_to from the message since it does not need to be sent to the
+						// client
+						if let Some(reply_to) = tunnel_msg.gateway_reply_to.take() {
+							tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), ?reply_to, "creating active request");
+							let mut active_requests = conn.tunnel_active_requests.lock().await;
+							active_requests.insert(
+								tunnel_msg.request_id,
+								TunnelActiveRequest {
+									gateway_reply_to: reply_to,
+									is_ws: false,
+								},
+							);
+						}
+					}
+					// If terminal, remove active request tracking
+					protocol::ToClientTunnelMessageKind::ToClientRequestAbort => {
+						tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), "removing active conn due to close message");
+						let mut active_requests = conn.tunnel_active_requests.lock().await;
+						active_requests.remove(&tunnel_msg.request_id);
+					}
+					protocol::ToClientTunnelMessageKind::ToClientWebSocketOpen(_) => {
+						// Save active request
+						//
+						// This will remove gateway_reply_to from the message since it does not need to be sent to the
+						// client
+						if let Some(reply_to) = tunnel_msg.gateway_reply_to.take() {
+							tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), ?reply_to, "creating active request");
+							let mut active_requests = conn.tunnel_active_requests.lock().await;
+							active_requests.insert(
+								tunnel_msg.request_id,
+								TunnelActiveRequest {
+									gateway_reply_to: reply_to,
+									is_ws: true,
+								},
+							);
+						}
+					}
 					// If terminal, remove active request tracking
 					protocol::ToClientTunnelMessageKind::ToClientWebSocketClose(_) => {
-						tracing::debug!(?tunnel_msg.request_id, "removing active conn due to close message");
+						tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), "removing active conn due to close message");
 						let mut active_requests = conn.tunnel_active_requests.lock().await;
 						active_requests.remove(&tunnel_msg.request_id);
 					}
@@ -67,7 +92,7 @@ pub async fn task(conn: Arc<Conn>, mut sub: Subscriber) -> Result<()> {
 
 		// Forward raw message to WebSocket
 		let serialized_msg =
-			match versioned::ToClient::latest(msg).serialize_version(conn.protocol_version) {
+			match versioned::ToClient::wrap_latest(msg).serialize_version(conn.protocol_version) {
 				Result::Ok(x) => x,
 				Err(err) => {
 					tracing::error!(?err, "failed to serialize tunnel message");
diff --git a/engine/packages/pegboard-serverless/src/lib.rs b/engine/packages/pegboard-serverless/src/lib.rs
index fb58597f64..a5125a90a1 100644
--- a/engine/packages/pegboard-serverless/src/lib.rs
+++ b/engine/packages/pegboard-serverless/src/lib.rs
@@ -383,7 +383,7 @@ async fn outbound_handler(
 					}
 				}
 				Err(sse::Error::StreamEnded) => {
-					tracing::debug!("outbound req stopped early");
+					tracing::debug!(?runner_id, "outbound req stopped early");
 
 					return Ok(());
 				}
@@ -417,7 +417,7 @@ async fn outbound_handler(
 			match event {
 				Ok(sse::Event::Open) => {}
 				Ok(sse::Event::Message(msg)) => {
-					tracing::debug!(%msg.data, "received outbound req message");
+					tracing::debug!(%msg.data, ?runner_id, "received outbound req message");
 
 					// If runner_id is none at this point it means we did not send the stopping signal yet, so
 					// send it now
@@ -451,7 +451,7 @@ async fn outbound_handler(
 	tokio::select! {
 		res = wait_for_shutdown_fut => return res.map_err(Into::into),
 		_ = tokio::time::sleep(DRAIN_GRACE_PERIOD) => {
-			tracing::debug!("reached drain grace period before runner shut down")
+			tracing::debug!(?runner_id, "reached drain grace period before runner shut down")
 		}
 	}
 
@@ -463,15 +463,15 @@ async fn outbound_handler(
 		publish_to_client_stop(ctx, runner_id).await?;
 	}
 
-	tracing::debug!("outbound req stopped");
+	tracing::debug!(?runner_id, "outbound req stopped");
 
 	Ok(())
 }
 
 async fn drain_runner(ctx: &StandaloneCtx, runner_id: Id) -> Result<()> {
 	let res = ctx
-		.signal(pegboard::workflows::runner::Forward {
-			inner: protocol::ToServer::ToServerStopping,
+		.signal(pegboard::workflows::runner::Stop {
+			reset_actor_rescheduling: true,
 		})
 		.to_workflow::<pegboard::workflows::runner::Workflow>()
 		.tag("runner_id", runner_id)
@@ -501,7 +501,7 @@ async fn publish_to_client_stop(ctx: &StandaloneCtx, runner_id: Id) -> Result<()
 	let receiver_subject =
 		pegboard::pubsub_subjects::RunnerReceiverSubject::new(runner_id).to_string();
 
-	let message_serialized = rivet_runner_protocol::versioned::ToClient::latest(
+	let message_serialized = rivet_runner_protocol::versioned::ToClient::wrap_latest(
 		rivet_runner_protocol::ToClient::ToClientClose,
 	)
 	.serialize_with_embedded_version(rivet_runner_protocol::PROTOCOL_VERSION)?;
diff --git a/engine/packages/pegboard/src/keys/ns.rs b/engine/packages/pegboard/src/keys/ns.rs
index 33dc7ddbcc..a23ed5d1cb 100644
--- a/engine/packages/pegboard/src/keys/ns.rs
+++ b/engine/packages/pegboard/src/keys/ns.rs
@@ -52,7 +52,7 @@ impl FormalKey for RunnerAllocIdxKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		rivet_data::versioned::RunnerAllocIdxKeyData::latest(value.try_into()?)
+		rivet_data::versioned::RunnerAllocIdxKeyData::wrap_latest(value.try_into()?)
 			.serialize_with_embedded_version(
 				rivet_data::PEGBOARD_NAMESPACE_RUNNER_ALLOC_IDX_VERSION,
 			)
@@ -570,7 +570,7 @@ impl FormalKey for ActorByKeyKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		rivet_data::versioned::ActorByKeyKeyData::latest(value.try_into()?)
+		rivet_data::versioned::ActorByKeyKeyData::wrap_latest(value.try_into()?)
 			.serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_ACTOR_BY_KEY_VERSION)
 	}
 }
@@ -1186,7 +1186,7 @@ impl FormalKey for RunnerByKeyKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		rivet_data::versioned::RunnerByKeyKeyData::latest(value.try_into()?)
+		rivet_data::versioned::RunnerByKeyKeyData::wrap_latest(value.try_into()?)
 			.serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_RUNNER_BY_KEY_VERSION)
 	}
 }
@@ -1247,7 +1247,7 @@ impl FormalKey for ActorNameKey {
 	}
 
 	fn serialize(&self, value: Self::Value) -> Result<Vec<u8>> {
-		rivet_data::versioned::ActorNameKeyData::latest(value.try_into()?)
+		rivet_data::versioned::ActorNameKeyData::wrap_latest(value.try_into()?)
 			.serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_ACTOR_NAME_VERSION)
 	}
 }
diff --git a/engine/packages/pegboard/src/keys/runner.rs b/engine/packages/pegboard/src/keys/runner.rs
index fdc726f239..9c038b4a73 100644
--- a/engine/packages/pegboard/src/keys/runner.rs
+++ b/engine/packages/pegboard/src/keys/runner.rs
@@ -767,7 +767,7 @@ impl FormalChunkedKey for MetadataKey {
 
 	fn split(&self, value: Self::Value) -> Result<Vec<Vec<u8>>> {
 		Ok(
-			rivet_data::versioned::MetadataKeyData::latest(value.try_into()?)
+			rivet_data::versioned::MetadataKeyData::wrap_latest(value.try_into()?)
 				.serialize_with_embedded_version(rivet_data::PEGBOARD_RUNNER_METADATA_VERSION)?
 				.chunks(universaldb::utils::CHUNK_SIZE)
 				.map(|x| x.to_vec())
diff --git a/engine/packages/pegboard/src/workflows/actor/mod.rs b/engine/packages/pegboard/src/workflows/actor/mod.rs
index cae0baae00..e657fad85f 100644
--- a/engine/packages/pegboard/src/workflows/actor/mod.rs
+++ b/engine/packages/pegboard/src/workflows/actor/mod.rs
@@ -268,6 +268,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 							Main::Lost(Lost {
 								generation: state.generation,
 								force_reschedule: false,
+								reset_rescheduling: false,
 							})
 						}
 					} else if let Some(alarm_ts) = state.alarm_ts {
@@ -372,7 +373,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 										protocol::ActorStateStopped { code, .. },
 									) => {
 										if let Some(res) =
-											handle_stopped(ctx, &input, state, Some(code), false, false)
+											handle_stopped(ctx, &input, state, Some(code), None)
 												.await?
 										{
 											return Ok(Loop::Break(res));
@@ -393,7 +394,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 									state.sleeping = false;
 									state.will_wake = false;
 
-									match runtime::reschedule_actor(ctx, &input, state, false).await? {
+									match runtime::reschedule_actor(ctx, &input, state, false, false).await? {
 										runtime::SpawnActorOutput::Allocated { .. } => {},
 										runtime::SpawnActorOutput::Sleep => {
 											state.sleeping = true;
@@ -434,7 +435,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 							}
 
 							if let Some(res) =
-								handle_stopped(ctx, &input, state, None, true, sig.force_reschedule).await?
+								handle_stopped(ctx, &input, state, None, Some(sig)).await?
 							{
 								return Ok(Loop::Break(res));
 							}
@@ -493,10 +494,9 @@ async fn handle_stopped(
 	input: &Input,
 	state: &mut runtime::LifecycleState,
 	code: Option<protocol::StopCode>,
-	lost: bool,
-	force_reschedule: bool,
+	lost_sig: Option<Lost>,
 ) -> Result<Option<runtime::LifecycleRes>> {
-	tracing::debug!(?code, %force_reschedule, "actor stopped");
+	tracing::debug!(?code, ?lost_sig, "actor stopped");
 
 	// Reset retry count on successful exit
 	if let Some(protocol::StopCode::Ok) = code {
@@ -541,7 +541,7 @@ async fn handle_stopped(
 	}
 
 	// Kill old actor if lost (just in case it ended up allocating)
-	if let (true, Some(old_runner_workflow_id)) = (lost, old_runner_workflow_id) {
+	if let (Some(_), Some(old_runner_workflow_id)) = (&lost_sig, old_runner_workflow_id) {
 		ctx.signal(crate::workflows::runner::Command {
 			inner: protocol::Command::CommandStopActor(protocol::CommandStopActor {
 				actor_id: input.actor_id.to_string(),
@@ -553,13 +553,24 @@ async fn handle_stopped(
 		.await?;
 	}
 
+	let (force_reschedule, reset_rescheduling) = if let Some(lost_sig) = &lost_sig {
+		(lost_sig.force_reschedule, lost_sig.reset_rescheduling)
+	} else {
+		(false, false)
+	};
+
 	// Reschedule no matter what
 	if force_reschedule {
-		match runtime::reschedule_actor(ctx, &input, state, true).await? {
+		match runtime::reschedule_actor(ctx, &input, state, true, reset_rescheduling).await? {
 			runtime::SpawnActorOutput::Allocated { .. } => {}
 			// NOTE: This should be unreachable because force_reschedule is true
 			runtime::SpawnActorOutput::Sleep => {
 				state.sleeping = true;
+
+				ctx.activity(runtime::SetSleepingInput {
+					actor_id: input.actor_id,
+				})
+				.await?;
 			}
 			runtime::SpawnActorOutput::Destroy => {
 				// Destroyed early
@@ -578,7 +589,9 @@ async fn handle_stopped(
 
 		match (input.crash_policy, failed) {
 			(CrashPolicy::Restart, true) => {
-				match runtime::reschedule_actor(ctx, &input, state, false).await? {
+				match runtime::reschedule_actor(ctx, &input, state, false, reset_rescheduling)
+					.await?
+				{
 					runtime::SpawnActorOutput::Allocated { .. } => {}
 					// NOTE: Its not possible for `SpawnActorOutput::Sleep` to be returned here, the crash
 					// policy is `Restart`.
@@ -608,7 +621,7 @@ async fn handle_stopped(
 
 				return Ok(Some(runtime::LifecycleRes {
 					generation: state.generation,
-					kill: lost,
+					kill: lost_sig.is_some(),
 				}));
 			}
 		}
@@ -617,7 +630,7 @@ async fn handle_stopped(
 	else if state.will_wake {
 		state.sleeping = false;
 
-		match runtime::reschedule_actor(ctx, &input, state, false).await? {
+		match runtime::reschedule_actor(ctx, &input, state, false, reset_rescheduling).await? {
 			runtime::SpawnActorOutput::Allocated { .. } => {}
 			runtime::SpawnActorOutput::Sleep => {
 				state.sleeping = true;
@@ -676,10 +689,14 @@ pub struct Event {
 #[signal("pegboard_actor_wake")]
 pub struct Wake {}
 
+#[derive(Debug)]
 #[signal("pegboard_actor_lost")]
 pub struct Lost {
 	pub generation: u32,
+	/// Immediately reschedules the actor regardless of its crash policy.
 	pub force_reschedule: bool,
+	/// Resets the rescheduling retry count to 0.
+	pub reset_rescheduling: bool,
 }
 
 #[signal("pegboard_actor_destroy")]
diff --git a/engine/packages/pegboard/src/workflows/actor/runtime.rs b/engine/packages/pegboard/src/workflows/actor/runtime.rs
index e98eb5146a..f4d03e80a5 100644
--- a/engine/packages/pegboard/src/workflows/actor/runtime.rs
+++ b/engine/packages/pegboard/src/workflows/actor/runtime.rs
@@ -614,6 +614,7 @@ pub async fn reschedule_actor(
 	input: &Input,
 	state: &mut LifecycleState,
 	force_reschedule: bool,
+	reset_rescheduling: bool,
 ) -> Result<SpawnActorOutput> {
 	tracing::debug!(actor_id=?input.actor_id, "rescheduling actor");
 
@@ -633,7 +634,7 @@ pub async fn reschedule_actor(
 		})
 		.await?;
 
-	state.reschedule_state.retry_count = if reset {
+	state.reschedule_state.retry_count = if reset || reset_rescheduling {
 		0
 	} else {
 		state.reschedule_state.retry_count + 1
@@ -726,7 +727,7 @@ struct CompareRetryInput {
 async fn compare_retry(ctx: &ActivityCtx, input: &CompareRetryInput) -> Result<(i64, bool)> {
 	let now = util::timestamp::now();
 
-	// If the last retry ts is more than RETRY_RESET_DURATION_MS, reset retry count
+	// If the last retry ts is more than RETRY_RESET_DURATION_MS ago, reset retry count
 	Ok((now, input.last_retry_ts < now - RETRY_RESET_DURATION_MS))
 }
 
diff --git a/engine/packages/pegboard/src/workflows/actor/setup.rs b/engine/packages/pegboard/src/workflows/actor/setup.rs
index 313bc3b369..136bad24a5 100644
--- a/engine/packages/pegboard/src/workflows/actor/setup.rs
+++ b/engine/packages/pegboard/src/workflows/actor/setup.rs
@@ -7,7 +7,7 @@ use super::State;
 
 use crate::{errors, keys};
 
-const MAX_INPUT_SIZE: usize = util::file_size::mebibytes(4) as usize;
+const MAX_INPUT_SIZE: usize = util::size::mebibytes(4) as usize;
 
 #[derive(Debug, Clone, Serialize, Deserialize, Hash)]
 pub struct ValidateInput {
diff --git a/engine/packages/pegboard/src/workflows/runner.rs b/engine/packages/pegboard/src/workflows/runner.rs
index 610b9ac370..64f5594bbb 100644
--- a/engine/packages/pegboard/src/workflows/runner.rs
+++ b/engine/packages/pegboard/src/workflows/runner.rs
@@ -166,6 +166,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 							for event in &events {
 								if event.index <= state.last_event_idx {
 									tracing::warn!(idx=%event.index, "event already received, ignoring");
+									continue;
 								}
 
 								let actor_id =
@@ -227,40 +228,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 							ctx.activity(AckCommandsInput { last_command_idx }).await?;
 						}
 						protocol::ToServer::ToServerStopping => {
-							if !state.draining {
-								// The workflow will enter a draining state where it can still process signals if
-								// needed. After RUNNER_LOST_THRESHOLD_MS it will exit this loop and stop.
-								state.draining = true;
-
-								// Can't parallelize these two activities, requires reading from state
-								ctx.activity(ClearDbInput {
-									runner_id: input.runner_id,
-									name: input.name.clone(),
-									key: input.key.clone(),
-									update_state: RunnerState::Draining,
-								})
-								.await?;
-
-								let actors = ctx
-									.activity(FetchRemainingActorsInput {
-										runner_id: input.runner_id,
-									})
-									.await?;
-
-								// Set all remaining actors to lost immediately
-								if !actors.is_empty() {
-									for (actor_id, generation) in &actors {
-										ctx.signal(crate::workflows::actor::Lost {
-											generation: *generation,
-											force_reschedule: false,
-										})
-										.to_workflow::<crate::workflows::actor::Workflow>()
-										.tag("actor_id", actor_id)
-										.send()
-										.await?;
-									}
-								}
-							}
+							handle_stopping(ctx, &input, state, false).await?;
 						}
 						protocol::ToServer::ToServerPing(_)
 						| protocol::ToServer::ToServerKvRequest(_)
@@ -291,6 +259,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 								// Because this is a race condition, we want the actor to reschedule
 								// regardless of its crash policy
 								force_reschedule: true,
+								reset_rescheduling: true,
 							})
 							.to_workflow::<crate::workflows::actor::Workflow>()
 							.tag("actor_id", actor_id)
@@ -347,14 +316,17 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 							.await?;
 					}
 				}
+				Some(Main::Stop(sig)) => {
+					handle_stopping(ctx, &input, state, sig.reset_actor_rescheduling).await?;
+				}
 				None => {
-					if state.draining
-						|| ctx
-							.activity(CheckExpiredInput {
-								runner_id: input.runner_id,
-							})
-							.await?
-					{
+					let expired = ctx
+						.activity(CheckExpiredInput {
+							runner_id: input.runner_id,
+						})
+						.await?;
+
+					if state.draining || expired {
 						return Ok(Loop::Break(()));
 					}
 				}
@@ -386,6 +358,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 			.signal(crate::workflows::actor::Lost {
 				generation,
 				force_reschedule: false,
+				reset_rescheduling: false,
 			})
 			.to_workflow::<crate::workflows::actor::Workflow>()
 			.tag("actor_id", actor_id)
@@ -416,6 +389,51 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()>
 	Ok(())
 }
 
+/// Puts the runner into the draining state and signals its remaining actors as lost. No-op if already draining.
+async fn handle_stopping(
+	ctx: &mut WorkflowCtx,
+	input: &Input,
+	state: &mut LifecycleState,
+	reset_actor_rescheduling: bool,
+) -> Result<()> {
+	if state.draining {
+		return Ok(());
+	}
+
+	// The workflow enters a draining state where it can still process signals if needed. After
+	// RUNNER_LOST_THRESHOLD_MS it will exit its loop and stop.
+	state.draining = true;
+
+	// Can't parallelize these two activities, requires reading from state
+	ctx.activity(ClearDbInput {
+		runner_id: input.runner_id,
+		name: input.name.clone(),
+		key: input.key.clone(),
+		update_state: RunnerState::Draining,
+	})
+	.await?;
+
+	let fetch_input = FetchRemainingActorsInput {
+		runner_id: input.runner_id,
+	};
+	let remaining_actors = ctx.activity(fetch_input).await?;
+
+	// Immediately signal every actor still on this runner as lost (an empty list sends nothing)
+	for (actor_id, generation) in &remaining_actors {
+		ctx.signal(crate::workflows::actor::Lost {
+			generation: *generation,
+			force_reschedule: false,
+			reset_rescheduling: reset_actor_rescheduling,
+		})
+		.to_workflow::<crate::workflows::actor::Workflow>()
+		.tag("actor_id", actor_id)
+		.send()
+		.await?;
+	}
+
+	Ok(())
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 struct LifecycleState {
 	draining: bool,
@@ -1124,7 +1142,7 @@ async fn send_message_to_runner(ctx: &ActivityCtx, input: &SendMessageToRunnerIn
 	let receiver_subject =
 		crate::pubsub_subjects::RunnerReceiverSubject::new(input.runner_id).to_string();
 
-	let message_serialized = versioned::ToClient::latest(input.message.clone())
+	let message_serialized = versioned::ToClient::wrap_latest(input.message.clone())
 		.serialize_with_embedded_version(PROTOCOL_VERSION)?;
 
 	ctx.ups()?
@@ -1137,6 +1155,11 @@ async fn send_message_to_runner(ctx: &ActivityCtx, input: &SendMessageToRunnerIn
 #[signal("pegboard_runner_check_queue")]
 pub struct CheckQueue {}
 
+#[signal("pegboard_runner_stop")]
+pub struct Stop {
+	pub reset_actor_rescheduling: bool,
+}
+
 #[signal("pegboard_runner_command")]
 pub struct Command {
 	pub inner: protocol::Command,
@@ -1152,4 +1175,5 @@ join_signal!(Main {
 	// Forwarded from the ws to this workflow
 	Forward(Forward),
 	CheckQueue,
+	Stop,
 });
diff --git a/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs b/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs
index e43fbe6a1d..704c4f3823 100644
--- a/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs
+++ b/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs
@@ -137,7 +137,7 @@ impl TransactionTask {
 		}
 	}
 
-	fn create_transaction(&self) -> RocksDbTransaction<OptimisticTransactionDB> {
+	fn create_transaction(&self) -> RocksDbTransaction<'_, OptimisticTransactionDB> {
 		let write_opts = WriteOptions::default();
 		let txn_opts = rocksdb::OptimisticTransactionOptions::default();
 		self.db.transaction_opt(&write_opts, &txn_opts)
diff --git a/engine/packages/universaldb/src/tx_ops.rs b/engine/packages/universaldb/src/tx_ops.rs
index aedd4f41db..878ae168e8 100644
--- a/engine/packages/universaldb/src/tx_ops.rs
+++ b/engine/packages/universaldb/src/tx_ops.rs
@@ -34,17 +34,6 @@ pub enum Operation {
 	},
 }
 
-impl Operation {
-	pub fn sorting_key(&self) -> &[u8] {
-		match self {
-			Operation::Set { key, .. } => key,
-			Operation::Clear { key } => key,
-			Operation::ClearRange { begin, .. } => begin,
-			Operation::AtomicOp { key, .. } => key,
-		}
-	}
-}
-
 #[derive(Debug, Clone)]
 pub enum GetOutput {
 	Value(Vec<u8>),
diff --git a/engine/packages/universalpubsub/src/chunking.rs b/engine/packages/universalpubsub/src/chunking.rs
index 2c230efbb9..2d276233fc 100644
--- a/engine/packages/universalpubsub/src/chunking.rs
+++ b/engine/packages/universalpubsub/src/chunking.rs
@@ -144,7 +144,7 @@ pub fn split_payload_into_chunks(
 	let start_ups_message = rivet_ups_protocol::UpsMessage {
 		body: MessageBody::MessageStart(start_message),
 	};
-	let start_overhead = UpsMessage::latest(start_ups_message)
+	let start_overhead = UpsMessage::wrap_latest(start_ups_message)
 		.serialize_with_embedded_version(PROTOCOL_VERSION)?
 		.len();
 
@@ -157,7 +157,7 @@ pub fn split_payload_into_chunks(
 	let chunk_ups_message = rivet_ups_protocol::UpsMessage {
 		body: MessageBody::MessageChunk(chunk_message),
 	};
-	let chunk_overhead = UpsMessage::latest(chunk_ups_message)
+	let chunk_overhead = UpsMessage::wrap_latest(chunk_ups_message)
 		.serialize_with_embedded_version(PROTOCOL_VERSION)?
 		.len();
 
@@ -222,5 +222,5 @@ pub fn encode_chunk(
 	};
 
 	let ups_message = rivet_ups_protocol::UpsMessage { body };
-	UpsMessage::latest(ups_message).serialize_with_embedded_version(PROTOCOL_VERSION)
+	UpsMessage::wrap_latest(ups_message).serialize_with_embedded_version(PROTOCOL_VERSION)
 }
diff --git a/engine/packages/universalpubsub/src/driver/postgres/mod.rs b/engine/packages/universalpubsub/src/driver/postgres/mod.rs
index c2f20b68d0..ca801e81b2 100644
--- a/engine/packages/universalpubsub/src/driver/postgres/mod.rs
+++ b/engine/packages/universalpubsub/src/driver/postgres/mod.rs
@@ -292,10 +292,9 @@ impl PubSubDriver for PostgresDriver {
 				// Try to LISTEN if client is available, but don't fail if disconnected
 				// The reconnection logic will handle re-subscribing
 				if let Some(client) = self.client.lock().await.clone() {
-					let span = tracing::trace_span!("pg_listen");
 					match client
 						.execute(&format!("LISTEN \"{hashed}\""), &[])
-						.instrument(span)
+						.instrument(tracing::trace_span!("pg_listen"))
 						.await
 					{
 						Result::Ok(_) => {
@@ -359,7 +358,7 @@ impl PubSubDriver for PostgresDriver {
 		// Retry getting a connection from the pool with backoff in case the connection is
 		// currently disconnected
 		let mut backoff = Backoff::default();
-		let mut last_error = None;
+		let mut last_error;
 
 		loop {
 			match self.pool.get().await {
@@ -368,10 +367,9 @@ impl PubSubDriver for PostgresDriver {
 					match conn.execute("SELECT 1", &[]).await {
 						Result::Ok(_) => {
 							// Connection is good, use it for NOTIFY
-							let span = tracing::trace_span!("pg_notify");
 							match conn
 								.execute(&format!("NOTIFY \"{hashed}\", '{encoded}'"), &[])
-								.instrument(span)
+								.instrument(tracing::trace_span!("pg_notify"))
 								.await
 							{
 								Result::Ok(_) => return Ok(()),
diff --git a/engine/packages/util/src/lib.rs b/engine/packages/util/src/lib.rs
index 39b01b71bb..213e0be058 100644
--- a/engine/packages/util/src/lib.rs
+++ b/engine/packages/util/src/lib.rs
@@ -6,7 +6,6 @@ pub mod billing;
 pub mod check;
 pub mod duration;
 pub mod faker;
-pub mod file_size;
 pub mod format;
 pub mod future;
 pub mod geo;
@@ -14,6 +13,7 @@ pub mod math;
 pub mod req;
 pub mod serde;
 pub mod signal;
+pub mod size;
 pub mod sort;
 pub mod timestamp;
 pub mod url;
diff --git a/engine/packages/util/src/file_size.rs b/engine/packages/util/src/size.rs
similarity index 100%
rename from engine/packages/util/src/file_size.rs
rename to engine/packages/util/src/size.rs
diff --git a/engine/sdks/rust/data/src/versioned/mod.rs b/engine/sdks/rust/data/src/versioned/mod.rs
index ca5b6ced97..f6814cad32 100644
--- a/engine/sdks/rust/data/src/versioned/mod.rs
+++ b/engine/sdks/rust/data/src/versioned/mod.rs
@@ -14,11 +14,11 @@ pub enum RunnerAllocIdxKeyData {
 impl OwnedVersionedData for RunnerAllocIdxKeyData {
 	type Latest = pegboard_namespace_runner_alloc_idx_v1::Data;
 
-	fn latest(latest: pegboard_namespace_runner_alloc_idx_v1::Data) -> Self {
+	fn wrap_latest(latest: pegboard_namespace_runner_alloc_idx_v1::Data) -> Self {
 		RunnerAllocIdxKeyData::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let RunnerAllocIdxKeyData::V1(data) = self {
 			Ok(data)
@@ -48,11 +48,11 @@ pub enum MetadataKeyData {
 impl OwnedVersionedData for MetadataKeyData {
 	type Latest = pegboard_runner_metadata_v1::Data;
 
-	fn latest(latest: pegboard_runner_metadata_v1::Data) -> Self {
+	fn wrap_latest(latest: pegboard_runner_metadata_v1::Data) -> Self {
 		MetadataKeyData::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let MetadataKeyData::V1(data) = self {
 			Ok(data)
@@ -82,11 +82,11 @@ pub enum ActorByKeyKeyData {
 impl OwnedVersionedData for ActorByKeyKeyData {
 	type Latest = pegboard_namespace_actor_by_key_v1::Data;
 
-	fn latest(latest: pegboard_namespace_actor_by_key_v1::Data) -> Self {
+	fn wrap_latest(latest: pegboard_namespace_actor_by_key_v1::Data) -> Self {
 		ActorByKeyKeyData::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let ActorByKeyKeyData::V1(data) = self {
 			Ok(data)
@@ -116,11 +116,11 @@ pub enum RunnerByKeyKeyData {
 impl OwnedVersionedData for RunnerByKeyKeyData {
 	type Latest = pegboard_namespace_runner_by_key_v1::Data;
 
-	fn latest(latest: pegboard_namespace_runner_by_key_v1::Data) -> Self {
+	fn wrap_latest(latest: pegboard_namespace_runner_by_key_v1::Data) -> Self {
 		RunnerByKeyKeyData::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let RunnerByKeyKeyData::V1(data) = self {
 			Ok(data)
@@ -150,11 +150,11 @@ pub enum ActorNameKeyData {
 impl OwnedVersionedData for ActorNameKeyData {
 	type Latest = pegboard_namespace_actor_name_v1::Data;
 
-	fn latest(latest: pegboard_namespace_actor_name_v1::Data) -> Self {
+	fn wrap_latest(latest: pegboard_namespace_actor_name_v1::Data) -> Self {
 		ActorNameKeyData::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let ActorNameKeyData::V1(data) = self {
 			Ok(data)
diff --git a/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs b/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs
index 252b48f47f..c902225806 100644
--- a/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs
+++ b/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs
@@ -11,11 +11,11 @@ pub enum NamespaceRunnerConfig {
 impl OwnedVersionedData for NamespaceRunnerConfig {
 	type Latest = namespace_runner_config_v2::RunnerConfig;
 
-	fn latest(latest: namespace_runner_config_v2::RunnerConfig) -> Self {
+	fn wrap_latest(latest: namespace_runner_config_v2::RunnerConfig) -> Self {
 		NamespaceRunnerConfig::V2(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let NamespaceRunnerConfig::V2(data) = self {
 			Ok(data)
@@ -87,7 +87,7 @@ impl NamespaceRunnerConfig {
 		match self {
 			NamespaceRunnerConfig::V1(_) => Ok(self),
 			NamespaceRunnerConfig::V2(config) => {
-				let namespace_runner_config_v2::RunnerConfig { metadata, kind } = config;
+				let namespace_runner_config_v2::RunnerConfig { kind, .. } = config;
 
 				match kind {
 					namespace_runner_config_v2::RunnerConfigKind::Serverless(serverless) => {
diff --git a/engine/sdks/rust/epoxy-protocol/src/versioned.rs b/engine/sdks/rust/epoxy-protocol/src/versioned.rs
index 8475286679..20a9507685 100644
--- a/engine/sdks/rust/epoxy-protocol/src/versioned.rs
+++ b/engine/sdks/rust/epoxy-protocol/src/versioned.rs
@@ -10,11 +10,11 @@ pub enum Request {
 impl OwnedVersionedData for Request {
 	type Latest = v1::Request;
 
-	fn latest(latest: v1::Request) -> Self {
+	fn wrap_latest(latest: v1::Request) -> Self {
 		Request::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let Request::V1(data) = self {
 			Ok(data)
@@ -50,11 +50,11 @@ pub enum Response {
 impl OwnedVersionedData for Response {
 	type Latest = v1::Response;
 
-	fn latest(latest: v1::Response) -> Self {
+	fn wrap_latest(latest: v1::Response) -> Self {
 		Response::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let Response::V1(data) = self {
 			Ok(data)
@@ -90,11 +90,11 @@ pub enum LogEntry {
 impl OwnedVersionedData for LogEntry {
 	type Latest = v1::LogEntry;
 
-	fn latest(latest: v1::LogEntry) -> Self {
+	fn wrap_latest(latest: v1::LogEntry) -> Self {
 		LogEntry::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let LogEntry::V1(data) = self {
 			Ok(data)
@@ -134,11 +134,11 @@ pub enum ClusterConfig {
 impl OwnedVersionedData for ClusterConfig {
 	type Latest = v1::ClusterConfig;
 
-	fn latest(latest: v1::ClusterConfig) -> Self {
+	fn wrap_latest(latest: v1::ClusterConfig) -> Self {
 		ClusterConfig::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let ClusterConfig::V1(data) = self {
 			Ok(data)
@@ -178,11 +178,11 @@ pub enum Ballot {
 impl OwnedVersionedData for Ballot {
 	type Latest = v1::Ballot;
 
-	fn latest(latest: v1::Ballot) -> Self {
+	fn wrap_latest(latest: v1::Ballot) -> Self {
 		Ballot::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let Ballot::V1(data) = self {
 			Ok(data)
diff --git a/engine/sdks/rust/runner-protocol/build.rs b/engine/sdks/rust/runner-protocol/build.rs
index 4f1bb3902a..3d8188330a 100644
--- a/engine/sdks/rust/runner-protocol/build.rs
+++ b/engine/sdks/rust/runner-protocol/build.rs
@@ -22,7 +22,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 	vbare_compiler::process_schemas_with_config(&schema_dir, &cfg)?;
 
 	// TypeScript SDK generation
-	let cli_js_path = workspace_root.join("node_modules/@bare-ts/tools/dist/bin/cli.js");
+	let cli_js_path = workspace_root
+		.parent()
+		.unwrap()
+		.join("node_modules/@bare-ts/tools/dist/bin/cli.js");
 	if cli_js_path.exists() {
 		typescript::generate_sdk(&schema_dir);
 	} else {
@@ -59,16 +62,20 @@ mod typescript {
 			panic!("Failed to create SDK directory: {}", e);
 		}
 
-		let output =
-			Command::new(workspace_root.join("node_modules/@bare-ts/tools/dist/bin/cli.js"))
-				.arg("compile")
-				.arg("--generator")
-				.arg("ts")
-				.arg(highest_version_path)
-				.arg("-o")
-				.arg(src_dir.join("index.ts"))
-				.output()
-				.expect("Failed to execute bare compiler for TypeScript");
+		let output = Command::new(
+			workspace_root
+				.parent()
+				.unwrap()
+				.join("node_modules/@bare-ts/tools/dist/bin/cli.js"),
+		)
+		.arg("compile")
+		.arg("--generator")
+		.arg("ts")
+		.arg(highest_version_path)
+		.arg("-o")
+		.arg(src_dir.join("index.ts"))
+		.output()
+		.expect("Failed to execute bare compiler for TypeScript");
 
 		if !output.status.success() {
 			panic!(
diff --git a/engine/sdks/rust/runner-protocol/src/lib.rs b/engine/sdks/rust/runner-protocol/src/lib.rs
index 676c99e464..04553acb49 100644
--- a/engine/sdks/rust/runner-protocol/src/lib.rs
+++ b/engine/sdks/rust/runner-protocol/src/lib.rs
@@ -2,6 +2,6 @@ pub mod generated;
 pub mod versioned;
 
 // Re-export latest
-pub use generated::v1::*;
+pub use generated::v2::*;
 
-pub const PROTOCOL_VERSION: u16 = 1;
+pub const PROTOCOL_VERSION: u16 = 2;
diff --git a/engine/sdks/rust/runner-protocol/src/versioned.rs b/engine/sdks/rust/runner-protocol/src/versioned.rs
index 75f6fa1167..eb95c194c5 100644
--- a/engine/sdks/rust/runner-protocol/src/versioned.rs
+++ b/engine/sdks/rust/runner-protocol/src/versioned.rs
@@ -1,22 +1,22 @@
 use anyhow::{Ok, Result, bail};
 use vbare::OwnedVersionedData;
 
-use crate::{PROTOCOL_VERSION, generated::v1};
+use crate::generated::{v1, v2};
 
 pub enum ToClient {
 	V1(v1::ToClient),
+	V2(v2::ToClient),
 }
 
 impl OwnedVersionedData for ToClient {
-	type Latest = v1::ToClient;
+	type Latest = v2::ToClient;
 
-	fn latest(latest: v1::ToClient) -> Self {
-		ToClient::V1(latest)
+	fn wrap_latest(latest: v2::ToClient) -> Self {
+		ToClient::V2(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
-		#[allow(irrefutable_let_patterns)]
-		if let ToClient::V1(data) = self {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
+		if let ToClient::V2(data) = self {
 			Ok(data)
 		} else {
 			bail!("version not latest");
@@ -26,6 +26,7 @@ impl OwnedVersionedData for ToClient {
 	fn deserialize_version(payload: &[u8], version: u16) -> Result<Self> {
 		match version {
 			1 => Ok(ToClient::V1(serde_bare::from_slice(payload)?)),
+			2 => Ok(ToClient::V2(serde_bare::from_slice(payload)?)),
 			_ => bail!("invalid version: {version}"),
 		}
 	}
@@ -33,24 +34,177 @@ impl OwnedVersionedData for ToClient {
 	fn serialize_version(self, _version: u16) -> Result<Vec<u8>> {
 		match self {
 			ToClient::V1(data) => serde_bare::to_vec(&data).map_err(Into::into),
+			ToClient::V2(data) => serde_bare::to_vec(&data).map_err(Into::into),
+		}
+	}
+
+	fn deserialize_converters() -> Vec<impl Fn(Self) -> Result<Self>> {
+		vec![Self::v1_to_v2]
+	}
+
+	fn serialize_converters() -> Vec<impl Fn(Self) -> Result<Self>> {
+		vec![Self::v2_to_v1]
+	}
+}
+
+impl ToClient {
+	fn v1_to_v2(self) -> Result<Self> {
+		match self {
+			ToClient::V1(x) => {
+				let inner = match x {
+					v1::ToClient::ToClientInit(init) => {
+						v2::ToClient::ToClientInit(v2::ToClientInit {
+							runner_id: init.runner_id,
+							last_event_idx: init.last_event_idx,
+							metadata: v2::ProtocolMetadata {
+								runner_lost_threshold: init.metadata.runner_lost_threshold,
+							},
+						})
+					}
+					v1::ToClient::ToClientClose => v2::ToClient::ToClientClose,
+					v1::ToClient::ToClientCommands(commands) => v2::ToClient::ToClientCommands(
+						commands
+							.into_iter()
+							.map(|cmd| v2::CommandWrapper {
+								index: cmd.index,
+								inner: match cmd.inner {
+									v1::Command::CommandStartActor(start) => {
+										v2::Command::CommandStartActor(v2::CommandStartActor {
+											actor_id: start.actor_id,
+											generation: start.generation,
+											config: v2::ActorConfig {
+												name: start.config.name,
+												key: start.config.key,
+												create_ts: start.config.create_ts,
+												input: start.config.input,
+											},
+										})
+									}
+									v1::Command::CommandStopActor(stop) => {
+										v2::Command::CommandStopActor(v2::CommandStopActor {
+											actor_id: stop.actor_id,
+											generation: stop.generation,
+										})
+									}
+								},
+							})
+							.collect(),
+					),
+					v1::ToClient::ToClientAckEvents(ack) => {
+						v2::ToClient::ToClientAckEvents(v2::ToClientAckEvents {
+							last_event_idx: ack.last_event_idx,
+						})
+					}
+					v1::ToClient::ToClientKvResponse(resp) => {
+						v2::ToClient::ToClientKvResponse(v2::ToClientKvResponse {
+							request_id: resp.request_id,
+							data: convert_kv_response_data_v1_to_v2(resp.data),
+						})
+					}
+					v1::ToClient::ToClientTunnelMessage(msg) => {
+						v2::ToClient::ToClientTunnelMessage(v2::ToClientTunnelMessage {
+							request_id: msg.request_id,
+							message_id: msg.message_id,
+							message_kind: convert_to_client_tunnel_message_kind_v1_to_v2(
+								msg.message_kind,
+							),
+							gateway_reply_to: msg.gateway_reply_to,
+						})
+					}
+				};
+
+				Ok(ToClient::V2(inner))
+			}
+			value @ ToClient::V2(_) => Ok(value),
+		}
+	}
+
+	fn v2_to_v1(self) -> Result<Self> {
+		match self {
+			ToClient::V1(_) => Ok(self),
+			ToClient::V2(x) => {
+				let inner = match x {
+					v2::ToClient::ToClientInit(init) => {
+						v1::ToClient::ToClientInit(v1::ToClientInit {
+							runner_id: init.runner_id,
+							last_event_idx: init.last_event_idx,
+							metadata: v1::ProtocolMetadata {
+								runner_lost_threshold: init.metadata.runner_lost_threshold,
+							},
+						})
+					}
+					v2::ToClient::ToClientClose => v1::ToClient::ToClientClose,
+					v2::ToClient::ToClientCommands(commands) => v1::ToClient::ToClientCommands(
+						commands
+							.into_iter()
+							.map(|cmd| v1::CommandWrapper {
+								index: cmd.index,
+								inner: match cmd.inner {
+									v2::Command::CommandStartActor(start) => {
+										v1::Command::CommandStartActor(v1::CommandStartActor {
+											actor_id: start.actor_id,
+											generation: start.generation,
+											config: v1::ActorConfig {
+												name: start.config.name,
+												key: start.config.key,
+												create_ts: start.config.create_ts,
+												input: start.config.input,
+											},
+										})
+									}
+									v2::Command::CommandStopActor(stop) => {
+										v1::Command::CommandStopActor(v1::CommandStopActor {
+											actor_id: stop.actor_id,
+											generation: stop.generation,
+										})
+									}
+								},
+							})
+							.collect(),
+					),
+					v2::ToClient::ToClientAckEvents(ack) => {
+						v1::ToClient::ToClientAckEvents(v1::ToClientAckEvents {
+							last_event_idx: ack.last_event_idx,
+						})
+					}
+					v2::ToClient::ToClientKvResponse(resp) => {
+						v1::ToClient::ToClientKvResponse(v1::ToClientKvResponse {
+							request_id: resp.request_id,
+							data: convert_kv_response_data_v2_to_v1(resp.data),
+						})
+					}
+					v2::ToClient::ToClientTunnelMessage(msg) => {
+						v1::ToClient::ToClientTunnelMessage(v1::ToClientTunnelMessage {
+							request_id: msg.request_id,
+							message_id: msg.message_id,
+							message_kind: convert_to_client_tunnel_message_kind_v2_to_v1(
+								msg.message_kind,
+							)?,
+							gateway_reply_to: msg.gateway_reply_to,
+						})
+					}
+				};
+
+				Ok(ToClient::V1(inner))
+			}
 		}
 	}
 }
 
 pub enum ToServer {
 	V1(v1::ToServer),
+	V2(v2::ToServer),
 }
 
 impl OwnedVersionedData for ToServer {
-	type Latest = v1::ToServer;
+	type Latest = v2::ToServer;
 
-	fn latest(latest: v1::ToServer) -> Self {
-		ToServer::V1(latest)
+	fn wrap_latest(latest: v2::ToServer) -> Self {
+		ToServer::V2(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
-		#[allow(irrefutable_let_patterns)]
-		if let ToServer::V1(data) = self {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
+		if let ToServer::V2(data) = self {
 			Ok(data)
 		} else {
 			bail!("version not latest");
@@ -60,6 +214,7 @@ impl OwnedVersionedData for ToServer {
 	fn deserialize_version(payload: &[u8], version: u16) -> Result<Self> {
 		match version {
 			1 => Ok(ToServer::V1(serde_bare::from_slice(payload)?)),
+			2 => Ok(ToServer::V2(serde_bare::from_slice(payload)?)),
 			_ => bail!("invalid version: {version}"),
 		}
 	}
@@ -67,24 +222,170 @@ impl OwnedVersionedData for ToServer {
 	fn serialize_version(self, _version: u16) -> Result<Vec<u8>> {
 		match self {
 			ToServer::V1(data) => serde_bare::to_vec(&data).map_err(Into::into),
+			ToServer::V2(data) => serde_bare::to_vec(&data).map_err(Into::into),
+		}
+	}
+
+	fn deserialize_converters() -> Vec<impl Fn(Self) -> Result<Self>> {
+		vec![Self::v1_to_v2]
+	}
+
+	fn serialize_converters() -> Vec<impl Fn(Self) -> Result<Self>> {
+		vec![Self::v2_to_v1]
+	}
+}
+
+impl ToServer {
+	fn v1_to_v2(self) -> Result<Self> {
+		match self {
+			ToServer::V1(x) => {
+				let inner = match x {
+					v1::ToServer::ToServerInit(init) => {
+						v2::ToServer::ToServerInit(v2::ToServerInit {
+							name: init.name,
+							version: init.version,
+							total_slots: init.total_slots,
+							last_command_idx: init.last_command_idx,
+							prepopulate_actor_names: init.prepopulate_actor_names.map(|map| {
+								map.into_iter()
+									.map(|(k, v)| {
+										(
+											k,
+											v2::ActorName {
+												metadata: v.metadata,
+											},
+										)
+									})
+									.collect()
+							}),
+							metadata: init.metadata,
+						})
+					}
+					v1::ToServer::ToServerEvents(events) => v2::ToServer::ToServerEvents(
+						events
+							.into_iter()
+							.map(|event| v2::EventWrapper {
+								index: event.index,
+								inner: convert_event_v1_to_v2(event.inner),
+							})
+							.collect(),
+					),
+					v1::ToServer::ToServerAckCommands(ack) => {
+						v2::ToServer::ToServerAckCommands(v2::ToServerAckCommands {
+							last_command_idx: ack.last_command_idx,
+						})
+					}
+					v1::ToServer::ToServerStopping => v2::ToServer::ToServerStopping,
+					v1::ToServer::ToServerPing(ping) => {
+						v2::ToServer::ToServerPing(v2::ToServerPing { ts: ping.ts })
+					}
+					v1::ToServer::ToServerKvRequest(req) => {
+						v2::ToServer::ToServerKvRequest(v2::ToServerKvRequest {
+							actor_id: req.actor_id,
+							request_id: req.request_id,
+							data: convert_kv_request_data_v1_to_v2(req.data),
+						})
+					}
+					v1::ToServer::ToServerTunnelMessage(msg) => {
+						v2::ToServer::ToServerTunnelMessage(v2::ToServerTunnelMessage {
+							request_id: msg.request_id,
+							message_id: msg.message_id,
+							message_kind: convert_to_server_tunnel_message_kind_v1_to_v2(
+								msg.message_kind,
+							),
+						})
+					}
+				};
+
+				Ok(ToServer::V2(inner))
+			}
+			value @ ToServer::V2(_) => Ok(value),
+		}
+	}
+
+	fn v2_to_v1(self) -> Result<Self> {
+		match self {
+			ToServer::V1(_) => Ok(self),
+			ToServer::V2(x) => {
+				let inner = match x {
+					v2::ToServer::ToServerInit(init) => {
+						v1::ToServer::ToServerInit(v1::ToServerInit {
+							name: init.name,
+							version: init.version,
+							total_slots: init.total_slots,
+							last_command_idx: init.last_command_idx,
+							prepopulate_actor_names: init.prepopulate_actor_names.map(|map| {
+								map.into_iter()
+									.map(|(k, v)| {
+										(
+											k,
+											v1::ActorName {
+												metadata: v.metadata,
+											},
+										)
+									})
+									.collect()
+							}),
+							metadata: init.metadata,
+						})
+					}
+					v2::ToServer::ToServerEvents(events) => v1::ToServer::ToServerEvents(
+						events
+							.into_iter()
+							.map(|event| v1::EventWrapper {
+								index: event.index,
+								inner: convert_event_v2_to_v1(event.inner),
+							})
+							.collect(),
+					),
+					v2::ToServer::ToServerAckCommands(ack) => {
+						v1::ToServer::ToServerAckCommands(v1::ToServerAckCommands {
+							last_command_idx: ack.last_command_idx,
+						})
+					}
+					v2::ToServer::ToServerStopping => v1::ToServer::ToServerStopping,
+					v2::ToServer::ToServerPing(ping) => {
+						v1::ToServer::ToServerPing(v1::ToServerPing { ts: ping.ts })
+					}
+					v2::ToServer::ToServerKvRequest(req) => {
+						v1::ToServer::ToServerKvRequest(v1::ToServerKvRequest {
+							actor_id: req.actor_id,
+							request_id: req.request_id,
+							data: convert_kv_request_data_v2_to_v1(req.data),
+						})
+					}
+					v2::ToServer::ToServerTunnelMessage(msg) => {
+						v1::ToServer::ToServerTunnelMessage(v1::ToServerTunnelMessage {
+							request_id: msg.request_id,
+							message_id: msg.message_id,
+							message_kind: convert_to_server_tunnel_message_kind_v2_to_v1(
+								msg.message_kind,
+							)?,
+						})
+					}
+				};
+
+				Ok(ToServer::V1(inner))
+			}
 		}
 	}
 }
 
 pub enum ToGateway {
-	V1(v1::ToGateway),
+	// No change between v1 and v2
+	V2(v2::ToGateway),
 }
 
 impl OwnedVersionedData for ToGateway {
-	type Latest = v1::ToGateway;
+	type Latest = v2::ToGateway;
 
-	fn latest(latest: v1::ToGateway) -> Self {
-		ToGateway::V1(latest)
+	fn wrap_latest(latest: v2::ToGateway) -> Self {
+		ToGateway::V2(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
-		if let ToGateway::V1(data) = self {
+		if let ToGateway::V2(data) = self {
 			Ok(data)
 		} else {
 			bail!("version not latest");
@@ -93,38 +394,33 @@ impl OwnedVersionedData for ToGateway {
 
 	fn deserialize_version(payload: &[u8], version: u16) -> Result<Self> {
 		match version {
-			1 => Ok(ToGateway::V1(serde_bare::from_slice(payload)?)),
+			1 | 2 => Ok(ToGateway::V2(serde_bare::from_slice(payload)?)),
 			_ => bail!("invalid version: {version}"),
 		}
 	}
 
 	fn serialize_version(self, _version: u16) -> Result<Vec<u8>> {
 		match self {
-			ToGateway::V1(data) => serde_bare::to_vec(&data).map_err(Into::into),
+			ToGateway::V2(data) => serde_bare::to_vec(&data).map_err(Into::into),
 		}
 	}
 }
 
-impl ToGateway {
-	pub fn serialize(self) -> Result<Vec<u8>> {
-		<Self as OwnedVersionedData>::serialize(self, PROTOCOL_VERSION)
-	}
-}
-
 pub enum ToServerlessServer {
-	V1(v1::ToServerlessServer),
+	// No change between v1 and v2
+	V2(v2::ToServerlessServer),
 }
 
 impl OwnedVersionedData for ToServerlessServer {
-	type Latest = v1::ToServerlessServer;
+	type Latest = v2::ToServerlessServer;
 
-	fn latest(latest: v1::ToServerlessServer) -> Self {
-		ToServerlessServer::V1(latest)
+	fn wrap_latest(latest: v2::ToServerlessServer) -> Self {
+		ToServerlessServer::V2(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
-		if let ToServerlessServer::V1(data) = self {
+		if let ToServerlessServer::V2(data) = self {
 			Ok(data)
 		} else {
 			bail!("version not latest");
@@ -133,14 +429,459 @@ impl OwnedVersionedData for ToServerlessServer {
 
 	fn deserialize_version(payload: &[u8], version: u16) -> Result<Self> {
 		match version {
-			1 => Ok(ToServerlessServer::V1(serde_bare::from_slice(payload)?)),
+			1 | 2 => Ok(ToServerlessServer::V2(serde_bare::from_slice(payload)?)),
 			_ => bail!("invalid version: {version}"),
 		}
 	}
 
 	fn serialize_version(self, _version: u16) -> Result<Vec<u8>> {
 		match self {
-			ToServerlessServer::V1(data) => serde_bare::to_vec(&data).map_err(Into::into),
+			ToServerlessServer::V2(data) => serde_bare::to_vec(&data).map_err(Into::into),
+		}
+	}
+}
+
+// Helper conversion functions
+fn convert_to_client_tunnel_message_kind_v1_to_v2(
+	kind: v1::ToClientTunnelMessageKind,
+) -> v2::ToClientTunnelMessageKind {
+	match kind {
+		v1::ToClientTunnelMessageKind::TunnelAck => v2::ToClientTunnelMessageKind::TunnelAck,
+		v1::ToClientTunnelMessageKind::ToClientRequestStart(req) => {
+			v2::ToClientTunnelMessageKind::ToClientRequestStart(v2::ToClientRequestStart {
+				actor_id: req.actor_id,
+				method: req.method,
+				path: req.path,
+				headers: req.headers,
+				body: req.body,
+				stream: req.stream,
+			})
+		}
+		v1::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => {
+			v2::ToClientTunnelMessageKind::ToClientRequestChunk(v2::ToClientRequestChunk {
+				body: chunk.body,
+				finish: chunk.finish,
+			})
+		}
+		v1::ToClientTunnelMessageKind::ToClientRequestAbort => {
+			v2::ToClientTunnelMessageKind::ToClientRequestAbort
+		}
+		v1::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => {
+			v2::ToClientTunnelMessageKind::ToClientWebSocketOpen(v2::ToClientWebSocketOpen {
+				actor_id: ws.actor_id,
+				path: ws.path,
+				headers: ws.headers,
+			})
+		}
+		v1::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => {
+			v2::ToClientTunnelMessageKind::ToClientWebSocketMessage(v2::ToClientWebSocketMessage {
+				// Default to 0 for v1 messages (hibernation disabled by default)
+				index: 0,
+				data: msg.data,
+				binary: msg.binary,
+			})
+		}
+		v1::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => {
+			v2::ToClientTunnelMessageKind::ToClientWebSocketClose(v2::ToClientWebSocketClose {
+				code: close.code,
+				reason: close.reason,
+			})
+		}
+	}
+}
+
+fn convert_to_client_tunnel_message_kind_v2_to_v1(
+	kind: v2::ToClientTunnelMessageKind,
+) -> Result<v1::ToClientTunnelMessageKind> {
+	Ok(match kind {
+		v2::ToClientTunnelMessageKind::TunnelAck => v1::ToClientTunnelMessageKind::TunnelAck,
+		v2::ToClientTunnelMessageKind::ToClientRequestStart(req) => {
+			v1::ToClientTunnelMessageKind::ToClientRequestStart(v1::ToClientRequestStart {
+				actor_id: req.actor_id,
+				method: req.method,
+				path: req.path,
+				headers: req.headers,
+				body: req.body,
+				stream: req.stream,
+			})
+		}
+		v2::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => {
+			v1::ToClientTunnelMessageKind::ToClientRequestChunk(v1::ToClientRequestChunk {
+				body: chunk.body,
+				finish: chunk.finish,
+			})
+		}
+		v2::ToClientTunnelMessageKind::ToClientRequestAbort => {
+			v1::ToClientTunnelMessageKind::ToClientRequestAbort
+		}
+		v2::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => {
+			v1::ToClientTunnelMessageKind::ToClientWebSocketOpen(v1::ToClientWebSocketOpen {
+				actor_id: ws.actor_id,
+				path: ws.path,
+				headers: ws.headers,
+			})
+		}
+		v2::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => {
+			v1::ToClientTunnelMessageKind::ToClientWebSocketMessage(v1::ToClientWebSocketMessage {
+				data: msg.data,
+				binary: msg.binary,
+			})
+		}
+		v2::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => {
+			v1::ToClientTunnelMessageKind::ToClientWebSocketClose(v1::ToClientWebSocketClose {
+				code: close.code,
+				reason: close.reason,
+			})
+		}
+	})
+}
+
+fn convert_to_server_tunnel_message_kind_v1_to_v2(
+	kind: v1::ToServerTunnelMessageKind,
+) -> v2::ToServerTunnelMessageKind {
+	match kind {
+		v1::ToServerTunnelMessageKind::TunnelAck => v2::ToServerTunnelMessageKind::TunnelAck,
+		v1::ToServerTunnelMessageKind::ToServerResponseStart(resp) => {
+			v2::ToServerTunnelMessageKind::ToServerResponseStart(v2::ToServerResponseStart {
+				status: resp.status,
+				headers: resp.headers,
+				body: resp.body,
+				stream: resp.stream,
+			})
+		}
+		v1::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => {
+			v2::ToServerTunnelMessageKind::ToServerResponseChunk(v2::ToServerResponseChunk {
+				body: chunk.body,
+				finish: chunk.finish,
+			})
+		}
+		v1::ToServerTunnelMessageKind::ToServerResponseAbort => {
+			v2::ToServerTunnelMessageKind::ToServerResponseAbort
+		}
+		v1::ToServerTunnelMessageKind::ToServerWebSocketOpen => {
+			v2::ToServerTunnelMessageKind::ToServerWebSocketOpen(v2::ToServerWebSocketOpen {
+				can_hibernate: false,
+				last_msg_index: -1,
+			})
+		}
+		v1::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => {
+			v2::ToServerTunnelMessageKind::ToServerWebSocketMessage(v2::ToServerWebSocketMessage {
+				data: msg.data,
+				binary: msg.binary,
+			})
+		}
+		v1::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => {
+			v2::ToServerTunnelMessageKind::ToServerWebSocketClose(v2::ToServerWebSocketClose {
+				code: close.code,
+				reason: close.reason,
+				retry: false,
+			})
+		}
+	}
+}
+
+fn convert_to_server_tunnel_message_kind_v2_to_v1(
+	kind: v2::ToServerTunnelMessageKind,
+) -> Result<v1::ToServerTunnelMessageKind> {
+	Ok(match kind {
+		v2::ToServerTunnelMessageKind::TunnelAck => v1::ToServerTunnelMessageKind::TunnelAck,
+		v2::ToServerTunnelMessageKind::ToServerResponseStart(resp) => {
+			v1::ToServerTunnelMessageKind::ToServerResponseStart(v1::ToServerResponseStart {
+				status: resp.status,
+				headers: resp.headers,
+				body: resp.body,
+				stream: resp.stream,
+			})
+		}
+		v2::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => {
+			v1::ToServerTunnelMessageKind::ToServerResponseChunk(v1::ToServerResponseChunk {
+				body: chunk.body,
+				finish: chunk.finish,
+			})
+		}
+		v2::ToServerTunnelMessageKind::ToServerResponseAbort => {
+			v1::ToServerTunnelMessageKind::ToServerResponseAbort
+		}
+		v2::ToServerTunnelMessageKind::ToServerWebSocketOpen(_) => {
+			v1::ToServerTunnelMessageKind::ToServerWebSocketOpen
+		}
+		v2::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => {
+			v1::ToServerTunnelMessageKind::ToServerWebSocketMessage(v1::ToServerWebSocketMessage {
+				data: msg.data,
+				binary: msg.binary,
+			})
+		}
+		v2::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(_) => {
+			// v1 doesn't have MessageAck, this is a v2-only feature
+			bail!("ToServerWebSocketMessageAck is not supported in v1");
 		}
+		v2::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => {
+			v1::ToServerTunnelMessageKind::ToServerWebSocketClose(v1::ToServerWebSocketClose {
+				code: close.code,
+				reason: close.reason,
+			})
+		}
+	})
+}
+
+fn convert_event_v1_to_v2(event: v1::Event) -> v2::Event {
+	match event {
+		v1::Event::EventActorIntent(intent) => v2::Event::EventActorIntent(v2::EventActorIntent {
+			actor_id: intent.actor_id,
+			generation: intent.generation,
+			intent: convert_actor_intent_v1_to_v2(intent.intent),
+		}),
+		v1::Event::EventActorStateUpdate(state) => {
+			v2::Event::EventActorStateUpdate(v2::EventActorStateUpdate {
+				actor_id: state.actor_id,
+				generation: state.generation,
+				state: convert_actor_state_v1_to_v2(state.state),
+			})
+		}
+		v1::Event::EventActorSetAlarm(alarm) => {
+			v2::Event::EventActorSetAlarm(v2::EventActorSetAlarm {
+				actor_id: alarm.actor_id,
+				generation: alarm.generation,
+				alarm_ts: alarm.alarm_ts,
+			})
+		}
+	}
+}
+
+fn convert_event_v2_to_v1(event: v2::Event) -> v1::Event {
+	match event {
+		v2::Event::EventActorIntent(intent) => v1::Event::EventActorIntent(v1::EventActorIntent {
+			actor_id: intent.actor_id,
+			generation: intent.generation,
+			intent: convert_actor_intent_v2_to_v1(intent.intent),
+		}),
+		v2::Event::EventActorStateUpdate(state) => {
+			v1::Event::EventActorStateUpdate(v1::EventActorStateUpdate {
+				actor_id: state.actor_id,
+				generation: state.generation,
+				state: convert_actor_state_v2_to_v1(state.state),
+			})
+		}
+		v2::Event::EventActorSetAlarm(alarm) => {
+			v1::Event::EventActorSetAlarm(v1::EventActorSetAlarm {
+				actor_id: alarm.actor_id,
+				generation: alarm.generation,
+				alarm_ts: alarm.alarm_ts,
+			})
+		}
+	}
+}
+
+fn convert_actor_intent_v1_to_v2(intent: v1::ActorIntent) -> v2::ActorIntent {
+	match intent {
+		v1::ActorIntent::ActorIntentSleep => v2::ActorIntent::ActorIntentSleep,
+		v1::ActorIntent::ActorIntentStop => v2::ActorIntent::ActorIntentStop,
+	}
+}
+
+fn convert_actor_intent_v2_to_v1(intent: v2::ActorIntent) -> v1::ActorIntent {
+	match intent {
+		v2::ActorIntent::ActorIntentSleep => v1::ActorIntent::ActorIntentSleep,
+		v2::ActorIntent::ActorIntentStop => v1::ActorIntent::ActorIntentStop,
+	}
+}
+
+fn convert_actor_state_v1_to_v2(state: v1::ActorState) -> v2::ActorState {
+	match state {
+		v1::ActorState::ActorStateRunning => v2::ActorState::ActorStateRunning,
+		v1::ActorState::ActorStateStopped(stopped) => {
+			v2::ActorState::ActorStateStopped(v2::ActorStateStopped {
+				code: convert_stop_code_v1_to_v2(stopped.code),
+				message: stopped.message,
+			})
+		}
+	}
+}
+
+fn convert_actor_state_v2_to_v1(state: v2::ActorState) -> v1::ActorState {
+	match state {
+		v2::ActorState::ActorStateRunning => v1::ActorState::ActorStateRunning,
+		v2::ActorState::ActorStateStopped(stopped) => {
+			v1::ActorState::ActorStateStopped(v1::ActorStateStopped {
+				code: convert_stop_code_v2_to_v1(stopped.code),
+				message: stopped.message,
+			})
+		}
+	}
+}
+
+fn convert_stop_code_v1_to_v2(code: v1::StopCode) -> v2::StopCode {
+	match code {
+		v1::StopCode::Ok => v2::StopCode::Ok,
+		v1::StopCode::Error => v2::StopCode::Error,
+	}
+}
+
+fn convert_stop_code_v2_to_v1(code: v2::StopCode) -> v1::StopCode {
+	match code {
+		v2::StopCode::Ok => v1::StopCode::Ok,
+		v2::StopCode::Error => v1::StopCode::Error,
+	}
+}
+
+fn convert_kv_request_data_v1_to_v2(data: v1::KvRequestData) -> v2::KvRequestData {
+	match data {
+		v1::KvRequestData::KvGetRequest(req) => {
+			v2::KvRequestData::KvGetRequest(v2::KvGetRequest { keys: req.keys })
+		}
+		v1::KvRequestData::KvListRequest(req) => {
+			v2::KvRequestData::KvListRequest(v2::KvListRequest {
+				query: convert_kv_list_query_v1_to_v2(req.query),
+				reverse: req.reverse,
+				limit: req.limit,
+			})
+		}
+		v1::KvRequestData::KvPutRequest(req) => v2::KvRequestData::KvPutRequest(v2::KvPutRequest {
+			keys: req.keys,
+			values: req.values,
+		}),
+		v1::KvRequestData::KvDeleteRequest(req) => {
+			v2::KvRequestData::KvDeleteRequest(v2::KvDeleteRequest { keys: req.keys })
+		}
+		v1::KvRequestData::KvDropRequest => v2::KvRequestData::KvDropRequest,
+	}
+}
+
+fn convert_kv_request_data_v2_to_v1(data: v2::KvRequestData) -> v1::KvRequestData {
+	match data {
+		v2::KvRequestData::KvGetRequest(req) => {
+			v1::KvRequestData::KvGetRequest(v1::KvGetRequest { keys: req.keys })
+		}
+		v2::KvRequestData::KvListRequest(req) => {
+			v1::KvRequestData::KvListRequest(v1::KvListRequest {
+				query: convert_kv_list_query_v2_to_v1(req.query),
+				reverse: req.reverse,
+				limit: req.limit,
+			})
+		}
+		v2::KvRequestData::KvPutRequest(req) => v1::KvRequestData::KvPutRequest(v1::KvPutRequest {
+			keys: req.keys,
+			values: req.values,
+		}),
+		v2::KvRequestData::KvDeleteRequest(req) => {
+			v1::KvRequestData::KvDeleteRequest(v1::KvDeleteRequest { keys: req.keys })
+		}
+		v2::KvRequestData::KvDropRequest => v1::KvRequestData::KvDropRequest,
+	}
+}
+
+fn convert_kv_response_data_v1_to_v2(data: v1::KvResponseData) -> v2::KvResponseData {
+	match data {
+		v1::KvResponseData::KvErrorResponse(err) => {
+			v2::KvResponseData::KvErrorResponse(v2::KvErrorResponse {
+				message: err.message,
+			})
+		}
+		v1::KvResponseData::KvGetResponse(resp) => {
+			v2::KvResponseData::KvGetResponse(v2::KvGetResponse {
+				keys: resp.keys,
+				values: resp.values,
+				metadata: resp
+					.metadata
+					.into_iter()
+					.map(convert_kv_metadata_v1_to_v2)
+					.collect(),
+			})
+		}
+		v1::KvResponseData::KvListResponse(resp) => {
+			v2::KvResponseData::KvListResponse(v2::KvListResponse {
+				keys: resp.keys,
+				values: resp.values,
+				metadata: resp
+					.metadata
+					.into_iter()
+					.map(convert_kv_metadata_v1_to_v2)
+					.collect(),
+			})
+		}
+		v1::KvResponseData::KvPutResponse => v2::KvResponseData::KvPutResponse,
+		v1::KvResponseData::KvDeleteResponse => v2::KvResponseData::KvDeleteResponse,
+		v1::KvResponseData::KvDropResponse => v2::KvResponseData::KvDropResponse,
+	}
+}
+
+fn convert_kv_response_data_v2_to_v1(data: v2::KvResponseData) -> v1::KvResponseData {
+	match data {
+		v2::KvResponseData::KvErrorResponse(err) => {
+			v1::KvResponseData::KvErrorResponse(v1::KvErrorResponse {
+				message: err.message,
+			})
+		}
+		v2::KvResponseData::KvGetResponse(resp) => {
+			v1::KvResponseData::KvGetResponse(v1::KvGetResponse {
+				keys: resp.keys,
+				values: resp.values,
+				metadata: resp
+					.metadata
+					.into_iter()
+					.map(convert_kv_metadata_v2_to_v1)
+					.collect(),
+			})
+		}
+		v2::KvResponseData::KvListResponse(resp) => {
+			v1::KvResponseData::KvListResponse(v1::KvListResponse {
+				keys: resp.keys,
+				values: resp.values,
+				metadata: resp
+					.metadata
+					.into_iter()
+					.map(convert_kv_metadata_v2_to_v1)
+					.collect(),
+			})
+		}
+		v2::KvResponseData::KvPutResponse => v1::KvResponseData::KvPutResponse,
+		v2::KvResponseData::KvDeleteResponse => v1::KvResponseData::KvDeleteResponse,
+		v2::KvResponseData::KvDropResponse => v1::KvResponseData::KvDropResponse,
+	}
+}
+
+fn convert_kv_list_query_v1_to_v2(query: v1::KvListQuery) -> v2::KvListQuery {
+	match query {
+		v1::KvListQuery::KvListAllQuery => v2::KvListQuery::KvListAllQuery,
+		v1::KvListQuery::KvListRangeQuery(range) => {
+			v2::KvListQuery::KvListRangeQuery(v2::KvListRangeQuery {
+				start: range.start,
+				end: range.end,
+				exclusive: range.exclusive,
+			})
+		}
+		v1::KvListQuery::KvListPrefixQuery(prefix) => {
+			v2::KvListQuery::KvListPrefixQuery(v2::KvListPrefixQuery { key: prefix.key })
+		}
+	}
+}
+
+fn convert_kv_list_query_v2_to_v1(query: v2::KvListQuery) -> v1::KvListQuery {
+	match query {
+		v2::KvListQuery::KvListAllQuery => v1::KvListQuery::KvListAllQuery,
+		v2::KvListQuery::KvListRangeQuery(range) => {
+			v1::KvListQuery::KvListRangeQuery(v1::KvListRangeQuery {
+				start: range.start,
+				end: range.end,
+				exclusive: range.exclusive,
+			})
+		}
+		v2::KvListQuery::KvListPrefixQuery(prefix) => {
+			v1::KvListQuery::KvListPrefixQuery(v1::KvListPrefixQuery { key: prefix.key })
+		}
+	}
+}
+
+fn convert_kv_metadata_v1_to_v2(metadata: v1::KvMetadata) -> v2::KvMetadata {
+	v2::KvMetadata {
+		version: metadata.version,
+		create_ts: metadata.create_ts,
+	}
+}
+
+fn convert_kv_metadata_v2_to_v1(metadata: v2::KvMetadata) -> v1::KvMetadata {
+	v1::KvMetadata {
+		version: metadata.version,
+		create_ts: metadata.create_ts,
 	}
 }
diff --git a/engine/sdks/rust/ups-protocol/src/versioned.rs b/engine/sdks/rust/ups-protocol/src/versioned.rs
index a4cb0fb07e..7f6eeae70c 100644
--- a/engine/sdks/rust/ups-protocol/src/versioned.rs
+++ b/engine/sdks/rust/ups-protocol/src/versioned.rs
@@ -10,11 +10,11 @@ pub enum UpsMessage {
 impl OwnedVersionedData for UpsMessage {
 	type Latest = v1::UpsMessage;
 
-	fn latest(latest: v1::UpsMessage) -> Self {
+	fn wrap_latest(latest: v1::UpsMessage) -> Self {
 		UpsMessage::V1(latest)
 	}
 
-	fn into_latest(self) -> Result<Self::Latest> {
+	fn unwrap_latest(self) -> Result<Self::Latest> {
 		#[allow(irrefutable_let_patterns)]
 		if let UpsMessage::V1(data) = self {
 			Ok(data)
diff --git a/engine/sdks/schemas/epoxy-protocol/v1.bare b/engine/sdks/schemas/epoxy-protocol/v1.bare
index 88558c1491..f3a0e04327 100644
--- a/engine/sdks/schemas/epoxy-protocol/v1.bare
+++ b/engine/sdks/schemas/epoxy-protocol/v1.bare
@@ -213,6 +213,11 @@ type KvGetResponse struct {
 	value: optional<data>
 }
 
+type KvPurgeRequest struct {
+	keys: list<data>
+}
+
+type KvPurgeResponse void
 
 # MARK: Request/Response
 type RequestKind union {
@@ -225,7 +230,8 @@ type RequestKind union {
 	HealthCheckRequest |
 	CoordinatorUpdateReplicaStatusRequest |
 	BeginLearningRequest |
-	KvGetRequest
+	KvGetRequest |
+	KvPurgeRequest
 }
 
 type Request struct {
@@ -244,7 +250,8 @@ type ResponseKind union {
 	HealthCheckResponse |
 	CoordinatorUpdateReplicaStatusResponse |
 	BeginLearningResponse |
-	KvGetResponse
+	KvGetResponse |
+	KvPurgeResponse
 }
 
 type Response struct {
diff --git a/engine/sdks/schemas/runner-protocol/v2.bare b/engine/sdks/schemas/runner-protocol/v2.bare
new file mode 100644
index 0000000000..5d2e499f44
--- /dev/null
+++ b/engine/sdks/schemas/runner-protocol/v2.bare
@@ -0,0 +1,403 @@
+# Runner Protocol v2
+
+# MARK: Core Primitives
+
+type Id str
+type Json str
+
+# MARK: KV
+
+# Basic types
+type KvKey data
+type KvValue data
+type KvMetadata struct {
+	version: data
+	createTs: i64
+}
+
+# Query types
+type KvListAllQuery void
+type KvListRangeQuery struct {
+	start: KvKey
+	end: KvKey
+	exclusive: bool
+}
+
+type KvListPrefixQuery struct {
+	key: KvKey
+}
+
+type KvListQuery union {
+	KvListAllQuery |
+	KvListRangeQuery |
+	KvListPrefixQuery
+}
+
+# Request types
+type KvGetRequest struct {
+	keys: list<KvKey>
+}
+
+type KvListRequest struct {
+	query: KvListQuery
+	reverse: optional<bool>
+	limit: optional<u64>
+}
+
+type KvPutRequest struct {
+	keys: list<KvKey>
+	values: list<KvValue>
+}
+
+type KvDeleteRequest struct {
+	keys: list<KvKey>
+}
+
+type KvDropRequest void
+
+# Response types
+type KvErrorResponse struct {
+	message: str
+}
+
+type KvGetResponse struct {
+	keys: list<KvKey>
+	values: list<KvValue>
+	metadata: list<KvMetadata>
+}
+
+type KvListResponse struct {
+	keys: list<KvKey>
+	values: list<KvValue>
+	metadata: list<KvMetadata>
+}
+
+type KvPutResponse void
+type KvDeleteResponse void
+type KvDropResponse void
+
+# Request/Response unions
+type KvRequestData union {
+	KvGetRequest |
+	KvListRequest |
+	KvPutRequest |
+	KvDeleteRequest |
+	KvDropRequest
+}
+
+type KvResponseData union {
+	KvErrorResponse |
+	KvGetResponse |
+	KvListResponse |
+	KvPutResponse |
+	KvDeleteResponse |
+	KvDropResponse
+}
+
+# MARK: Actor
+
+# Core
+type StopCode enum {
+	OK
+	ERROR
+}
+
+type ActorName struct {
+	metadata: Json
+}
+
+type ActorConfig struct {
+	name: str
+	key: optional<str>
+	createTs: i64
+	input: optional<data>
+}
+
+# Intent
+type ActorIntentSleep void
+
+type ActorIntentStop void
+
+type ActorIntent union {
+	ActorIntentSleep |
+	ActorIntentStop
+}
+
+# State
+type ActorStateRunning void
+
+type ActorStateStopped struct {
+	code: StopCode
+	message: optional<str>
+}
+
+type ActorState union {
+	ActorStateRunning |
+	ActorStateStopped
+}
+
+# MARK: Events
+type EventActorIntent struct {
+	actorId: Id
+	generation: u32
+	intent: ActorIntent
+}
+
+type EventActorStateUpdate struct {
+	actorId: Id
+	generation: u32
+	state: ActorState
+}
+
+type EventActorSetAlarm struct {
+	actorId: Id
+	generation: u32
+	alarmTs: optional<i64>
+}
+
+type Event union {
+	EventActorIntent |
+	EventActorStateUpdate |
+	EventActorSetAlarm
+}
+
+type EventWrapper struct {
+	index: i64
+	inner: Event
+}
+
+# MARK: Commands
+#
+type CommandStartActor struct {
+	actorId: Id
+	generation: u32
+	config: ActorConfig
+}
+
+type CommandStopActor struct {
+	actorId: Id
+	generation: u32
+}
+
+type Command union {
+	CommandStartActor |
+	CommandStopActor
+}
+
+type CommandWrapper struct {
+	index: i64
+	inner: Command
+}
+
+# MARK: Tunnel
+
+type RequestId data[16]  # UUIDv4
+type MessageId data[16]  # UUIDv4
+
+
+# Ack
+type TunnelAck void
+
+# HTTP
+type ToClientRequestStart struct {
+	actorId: Id
+	method: str
+	path: str
+	headers: map<str><str>
+	body: optional<data>
+	stream: bool
+}
+
+type ToClientRequestChunk struct {
+	body: data
+	finish: bool
+}
+
+type ToClientRequestAbort void
+
+type ToServerResponseStart struct {
+	status: u16
+	headers: map<str><str>
+	body: optional<data>
+	stream: bool
+}
+
+type ToServerResponseChunk struct {
+	body: data
+	finish: bool
+}
+
+type ToServerResponseAbort void
+
+# WebSocket
+type ToClientWebSocketOpen struct {
+	actorId: Id
+	path: str
+	headers: map<str><str>
+}
+
+type ToClientWebSocketMessage struct {
+	index: u16
+	data: data
+	binary: bool
+}
+
+type ToClientWebSocketClose struct {
+	code: optional<u16>
+	reason: optional<str>
+}
+
+type ToServerWebSocketOpen struct {
+	canHibernate: bool
+	lastMsgIndex: i64
+}
+
+type ToServerWebSocketMessage struct {
+	data: data
+	binary: bool
+}
+
+type ToServerWebSocketMessageAck struct {
+	index: u16
+}
+
+type ToServerWebSocketClose struct {
+	code: optional<u16>
+	reason: optional<str>
+	retry: bool
+}
+
+# To Server
+type ToServerTunnelMessageKind union {
+	TunnelAck |
+
+	# HTTP
+	ToServerResponseStart |
+	ToServerResponseChunk |
+	ToServerResponseAbort |
+	
+	# WebSocket
+	ToServerWebSocketOpen |
+	ToServerWebSocketMessage |
+	ToServerWebSocketMessageAck |
+	ToServerWebSocketClose
+}
+
+type ToServerTunnelMessage struct {
+	requestId: RequestId
+	messageId: MessageId
+	messageKind: ToServerTunnelMessageKind
+}
+
+# To Client
+type ToClientTunnelMessageKind union {
+	TunnelAck |
+
+	# HTTP
+	ToClientRequestStart |
+	ToClientRequestChunk |
+	ToClientRequestAbort |
+	
+	# WebSocket
+	ToClientWebSocketOpen |
+	ToClientWebSocketMessage |
+	ToClientWebSocketClose
+}
+
+type ToClientTunnelMessage struct {
+	requestId: RequestId
+	messageId: MessageId
+	messageKind: ToClientTunnelMessageKind
+
+	# Subject to send replies to.
+	#
+	# Only sent when opening a new request from gateway -> pegboard-runner-ws.
+	#
+	# Should be stripped before sending to the runner.
+	gatewayReplyTo: optional<str>
+}
+
+# MARK: To Server
+type ToServerInit struct {
+	name: str
+	version: u32
+	totalSlots: u32
+	lastCommandIdx: optional<i64>
+	prepopulateActorNames: optional<map<str><ActorName>>
+	metadata: optional<Json>
+}
+
+type ToServerEvents list<EventWrapper>
+
+type ToServerAckCommands struct {
+	lastCommandIdx: i64
+}
+
+type ToServerStopping void
+
+type ToServerPing struct {
+	ts: i64
+}
+
+type ToServerKvRequest struct {
+	actorId: Id
+	requestId: u32
+	data: KvRequestData
+}
+
+type ToServer union {
+	ToServerInit |
+	ToServerEvents |
+	ToServerAckCommands |
+	ToServerStopping |
+	ToServerPing |
+	ToServerKvRequest |
+	ToServerTunnelMessage
+}
+
+# MARK: To Client
+type ProtocolMetadata struct {
+	runnerLostThreshold: i64
+}
+
+type ToClientInit struct {
+	runnerId: Id
+	lastEventIdx: i64
+	metadata: ProtocolMetadata
+}
+
+type ToClientCommands list<CommandWrapper>
+
+type ToClientAckEvents struct {
+	lastEventIdx: i64
+}
+
+type ToClientKvResponse struct {
+	requestId: u32
+	data: KvResponseData
+}
+
+type ToClientClose void
+
+type ToClient union {
+	ToClientInit |
+	ToClientClose |
+	ToClientCommands |
+	ToClientAckEvents |
+	ToClientKvResponse |
+	ToClientTunnelMessage
+}
+
+# MARK: To Gateway
+type ToGateway struct {
+	message: ToServerTunnelMessage
+}
+
+# MARK: Serverless
+type ToServerlessServerInit struct {
+	runnerId: Id
+}
+
+type ToServerlessServer union {
+	ToServerlessServerInit
+}
diff --git a/engine/sdks/typescript/runner-protocol/src/index.ts b/engine/sdks/typescript/runner-protocol/src/index.ts
index 798b86a04d..c6405665cb 100644
--- a/engine/sdks/typescript/runner-protocol/src/index.ts
+++ b/engine/sdks/typescript/runner-protocol/src/index.ts
@@ -1063,18 +1063,21 @@ export function writeToClientWebSocketOpen(bc: bare.ByteCursor, x: ToClientWebSo
 }
 
 export type ToClientWebSocketMessage = {
+    readonly index: u16
     readonly data: ArrayBuffer
     readonly binary: boolean
 }
 
 export function readToClientWebSocketMessage(bc: bare.ByteCursor): ToClientWebSocketMessage {
     return {
+        index: bare.readU16(bc),
         data: bare.readData(bc),
         binary: bare.readBool(bc),
     }
 }
 
 export function writeToClientWebSocketMessage(bc: bare.ByteCursor, x: ToClientWebSocketMessage): void {
+    bare.writeU16(bc, x.index)
     bare.writeData(bc, x.data)
     bare.writeBool(bc, x.binary)
 }
@@ -1107,7 +1110,22 @@ export function writeToClientWebSocketClose(bc: bare.ByteCursor, x: ToClientWebS
     write5(bc, x.reason)
 }
 
-export type ToServerWebSocketOpen = null
+export type ToServerWebSocketOpen = {
+    readonly canHibernate: boolean
+    readonly lastMsgIndex: i64
+}
+
+export function readToServerWebSocketOpen(bc: bare.ByteCursor): ToServerWebSocketOpen {
+    return {
+        canHibernate: bare.readBool(bc),
+        lastMsgIndex: bare.readI64(bc),
+    }
+}
+
+export function writeToServerWebSocketOpen(bc: bare.ByteCursor, x: ToServerWebSocketOpen): void {
+    bare.writeBool(bc, x.canHibernate)
+    bare.writeI64(bc, x.lastMsgIndex)
+}
 
 export type ToServerWebSocketMessage = {
     readonly data: ArrayBuffer
@@ -1126,21 +1144,38 @@ export function writeToServerWebSocketMessage(bc: bare.ByteCursor, x: ToServerWe
     bare.writeBool(bc, x.binary)
 }
 
+export type ToServerWebSocketMessageAck = {
+    readonly index: u16
+}
+
+export function readToServerWebSocketMessageAck(bc: bare.ByteCursor): ToServerWebSocketMessageAck {
+    return {
+        index: bare.readU16(bc),
+    }
+}
+
+export function writeToServerWebSocketMessageAck(bc: bare.ByteCursor, x: ToServerWebSocketMessageAck): void {
+    bare.writeU16(bc, x.index)
+}
+
 export type ToServerWebSocketClose = {
     readonly code: u16 | null
     readonly reason: string | null
+    readonly retry: boolean
 }
 
 export function readToServerWebSocketClose(bc: bare.ByteCursor): ToServerWebSocketClose {
     return {
         code: read9(bc),
         reason: read5(bc),
+        retry: bare.readBool(bc),
     }
 }
 
 export function writeToServerWebSocketClose(bc: bare.ByteCursor, x: ToServerWebSocketClose): void {
     write9(bc, x.code)
     write5(bc, x.reason)
+    bare.writeBool(bc, x.retry)
 }
 
 /**
@@ -1159,6 +1194,7 @@ export type ToServerTunnelMessageKind =
      */
     | { readonly tag: "ToServerWebSocketOpen"; readonly val: ToServerWebSocketOpen }
     | { readonly tag: "ToServerWebSocketMessage"; readonly val: ToServerWebSocketMessage }
+    | { readonly tag: "ToServerWebSocketMessageAck"; readonly val: ToServerWebSocketMessageAck }
     | { readonly tag: "ToServerWebSocketClose"; readonly val: ToServerWebSocketClose }
 
 export function readToServerTunnelMessageKind(bc: bare.ByteCursor): ToServerTunnelMessageKind {
@@ -1174,10 +1210,12 @@ export function readToServerTunnelMessageKind(bc: bare.ByteCursor): ToServerTunn
         case 3:
             return { tag: "ToServerResponseAbort", val: null }
         case 4:
-            return { tag: "ToServerWebSocketOpen", val: null }
+            return { tag: "ToServerWebSocketOpen", val: readToServerWebSocketOpen(bc) }
         case 5:
             return { tag: "ToServerWebSocketMessage", val: readToServerWebSocketMessage(bc) }
         case 6:
+            return { tag: "ToServerWebSocketMessageAck", val: readToServerWebSocketMessageAck(bc) }
+        case 7:
             return { tag: "ToServerWebSocketClose", val: readToServerWebSocketClose(bc) }
         default: {
             bc.offset = offset
@@ -1208,6 +1246,7 @@ export function writeToServerTunnelMessageKind(bc: bare.ByteCursor, x: ToServerT
         }
         case "ToServerWebSocketOpen": {
             bare.writeU8(bc, 4)
+            writeToServerWebSocketOpen(bc, x.val)
             break
         }
         case "ToServerWebSocketMessage": {
@@ -1215,8 +1254,13 @@ export function writeToServerTunnelMessageKind(bc: bare.ByteCursor, x: ToServerT
             writeToServerWebSocketMessage(bc, x.val)
             break
         }
-        case "ToServerWebSocketClose": {
+        case "ToServerWebSocketMessageAck": {
             bare.writeU8(bc, 6)
+            writeToServerWebSocketMessageAck(bc, x.val)
+            break
+        }
+        case "ToServerWebSocketClose": {
+            bare.writeU8(bc, 7)
             writeToServerWebSocketClose(bc, x.val)
             break
         }
diff --git a/engine/sdks/typescript/runner/src/mod.ts b/engine/sdks/typescript/runner/src/mod.ts
index 8f9d2ff2c8..4603cc35f7 100644
--- a/engine/sdks/typescript/runner/src/mod.ts
+++ b/engine/sdks/typescript/runner/src/mod.ts
@@ -8,7 +8,7 @@ import { importWebSocket } from "./websocket.js";
 import type { WebSocketTunnelAdapter } from "./websocket-tunnel-adapter";
 
 const KV_EXPIRE: number = 30_000;
-const PROTOCOL_VERSION: number = 1;
+const PROTOCOL_VERSION: number = 2;
 
 /** Warn once the backlog significantly exceeds the server's ack batch size. */
 const EVENT_BACKLOG_WARN_THRESHOLD = 10_000;
@@ -62,9 +62,15 @@ export interface RunnerConfig {
 		config: ActorConfig,
 	) => Promise<void>;
 	onActorStop: (actorId: string, generation: number) => Promise<void>;
+	getActorHibernationConfig: (actorId: string, requestId: ArrayBuffer) => HibernationConfig;
 	noAutoShutdown?: boolean;
 }
 
+export interface HibernationConfig {
+	enabled: boolean;
+	lastMsgIndex: number | undefined;
+}
+
 export interface KvListOptions {
 	reverse?: boolean;
 	limit?: number;
@@ -155,9 +161,6 @@ export class Runner {
 		const actor = this.#removeActor(actorId, generation);
 		if (!actor) return;
 
-		// Unregister actor from tunnel
-		this.#tunnel?.unregisterActor(actor);
-
 		// If onActorStop times out, Pegboard will handle this timeout with ACTOR_STOP_THRESHOLD_DURATION_MS
 		try {
 			await this.#config.onActorStop(actorId, actor.generation);
@@ -246,23 +249,8 @@ export class Runner {
 
 		this.#actors.delete(actorId);
 
-		// Close all WebSocket connections for this actor
-		const actorWebSockets = this.#actorWebSockets.get(actorId);
-		if (actorWebSockets) {
-			for (const ws of actorWebSockets) {
-				try {
-					ws.close(1000, "Actor stopped");
-				} catch (err) {
-					logger()?.error({
-						msg: "error closing websocket for actor",
-						runnerId: this.runnerId,
-						actorId,
-						err,
-					});
-				}
-			}
-			this.#actorWebSockets.delete(actorId);
-		}
+		// Unregister actor from tunnel
+		this.#tunnel?.unregisterActor(actor);
 
 		return actor;
 	}
@@ -1390,6 +1378,10 @@ export class Runner {
 		}
 	}
 
+	sendWebsocketMessageAck(requestId: ArrayBuffer, index: number) {
+		this.#tunnel?.__ackWebsocketMessage(requestId, index);
+	}
+
 	getServerlessInitPacket(): string | undefined {
 		if (!this.runnerId) return undefined;
 
diff --git a/engine/sdks/typescript/runner/src/tunnel.ts b/engine/sdks/typescript/runner/src/tunnel.ts
index 3e9dfc24e2..9882341bc1 100644
--- a/engine/sdks/typescript/runner/src/tunnel.ts
+++ b/engine/sdks/typescript/runner/src/tunnel.ts
@@ -1,6 +1,6 @@
 import type * as protocol from "@rivetkit/engine-runner-protocol";
 import type { MessageId, RequestId } from "@rivetkit/engine-runner-protocol";
-import { v4 as uuidv4 } from "uuid";
+import { v4 as uuidv4, stringify as uuidstringify } from "uuid";
 import { logger } from "./log";
 import type { ActorInstance, Runner } from "./mod";
 import { unreachable } from "./utils";
@@ -8,6 +8,7 @@ import { WebSocketTunnelAdapter } from "./websocket-tunnel-adapter";
 
 const GC_INTERVAL = 60000; // 60 seconds
 const MESSAGE_ACK_TIMEOUT = 5000; // 5 seconds
+const WEBSOCKET_STATE_PERSIST_TIMEOUT = 30000; // 30 seconds
 
 interface PendingRequest {
 	resolve: (response: Response) => void;
@@ -56,7 +57,7 @@ export class Tunnel {
 
 		// Close all WebSockets
 		for (const [_, ws] of this.#actorWebSockets) {
-			ws.close();
+			ws.__closeWithRetry();
 		}
 		this.#actorWebSockets.clear();
 	}
@@ -108,6 +109,12 @@ export class Tunnel {
 			},
 		};
 
+		logger()?.debug({
+			msg: "ack tunnel msg",
+			requestId: uuidstringify(new Uint8Array(requestId)),
+			messageId: uuidstringify(new Uint8Array(messageId)),
+		});
+
 		this.#runner.__sendToServer(message);
 	}
 
@@ -156,7 +163,7 @@ export class Tunnel {
 				const webSocket = this.#actorWebSockets.get(requestIdStr);
 				if (webSocket) {
 					// Close the WebSocket connection
-					webSocket.close(1000, "Message acknowledgment timeout");
+					webSocket.__closeWithRetry(1000, "Message acknowledgment timeout");
 
 					// Clean up from actorWebSockets map
 					this.#actorWebSockets.delete(requestIdStr);
@@ -189,11 +196,11 @@ export class Tunnel {
 		}
 		actor.requests.clear();
 
-		// Close all WebSockets for this actor
+		// Flush acks and close all WebSockets for this actor
 		for (const webSocketId of actor.webSockets) {
 			const ws = this.#actorWebSockets.get(webSocketId);
 			if (ws) {
-				ws.close(1000, "Actor stopped");
+				ws.__closeWithRetry(1000, "Actor stopped");
 				this.#actorWebSockets.delete(webSocketId);
 			}
 		}
@@ -224,6 +231,13 @@ export class Tunnel {
 	}
 
 	async handleTunnelMessage(message: protocol.ToClientTunnelMessage) {
+		logger()?.debug({
+			msg: "tunnel msg",
+			requestId: uuidstringify(new Uint8Array(message.requestId)),
+			messageId: uuidstringify(new Uint8Array(message.messageId)),
+			message: message.messageKind,
+		});
+
 		if (message.messageKind.tag === "TunnelAck") {
 			// Mark pending message as acknowledged and remove it
 			const msgIdStr = bufferToString(message.messageId);
@@ -232,36 +246,47 @@ export class Tunnel {
 				this.#pendingTunnelMessages.delete(msgIdStr);
 			}
 		} else {
-			this.#sendAck(message.requestId, message.messageId);
 			switch (message.messageKind.tag) {
 				case "ToClientRequestStart":
+					this.#sendAck(message.requestId, message.messageId);
+
 					await this.#handleRequestStart(
 						message.requestId,
 						message.messageKind.val,
 					);
 					break;
 				case "ToClientRequestChunk":
+					this.#sendAck(message.requestId, message.messageId);
+
 					await this.#handleRequestChunk(
 						message.requestId,
 						message.messageKind.val,
 					);
 					break;
 				case "ToClientRequestAbort":
+					this.#sendAck(message.requestId, message.messageId);
+
 					await this.#handleRequestAbort(message.requestId);
 					break;
 				case "ToClientWebSocketOpen":
+					this.#sendAck(message.requestId, message.messageId);
+
 					await this.#handleWebSocketOpen(
 						message.requestId,
 						message.messageKind.val,
 					);
 					break;
 				case "ToClientWebSocketMessage":
-					await this.#handleWebSocketMessage(
+					this.#sendAck(message.requestId, message.messageId);
+
+					let _unhandled = await this.#handleWebSocketMessage(
 						message.requestId,
 						message.messageKind.val,
 					);
 					break;
 				case "ToClientWebSocketClose":
+					this.#sendAck(message.requestId, message.messageId);
+
 					await this.#handleWebSocketClose(
 						message.requestId,
 						message.messageKind.val,
@@ -311,8 +336,8 @@ export class Tunnel {
 							existing.actorId = req.actorId;
 						} else {
 							this.#actorPendingRequests.set(requestIdStr, {
-								resolve: () => {},
-								reject: () => {},
+								resolve: () => { },
+								reject: () => { },
 								streamController: controller,
 								actorId: req.actorId,
 							});
@@ -443,6 +468,7 @@ export class Tunnel {
 				val: {
 					code: 1011,
 					reason: "Actor not found",
+					retry: false,
 				},
 			});
 			return;
@@ -460,6 +486,7 @@ export class Tunnel {
 				val: {
 					code: 1011,
 					reason: "Not Implemented",
+					retry: false,
 				},
 			});
 			return;
@@ -479,7 +506,7 @@ export class Tunnel {
 					const dataBuffer =
 						typeof data === "string"
 							? (new TextEncoder().encode(data)
-									.buffer as ArrayBuffer)
+								.buffer as ArrayBuffer)
 							: data;
 
 					this.#sendMessage(requestId, {
@@ -490,13 +517,14 @@ export class Tunnel {
 						},
 					});
 				},
-				(code?: number, reason?: string) => {
+				(code?: number, reason?: string, retry: boolean = false) => {
 					// Send close through tunnel
 					this.#sendMessage(requestId, {
 						tag: "ToServerWebSocketClose",
 						val: {
 							code: code || null,
 							reason: reason || null,
+							retry,
 						},
 					});
 
@@ -514,13 +542,17 @@ export class Tunnel {
 			this.#actorWebSockets.set(webSocketId, adapter);
 
 			// Send open confirmation
+			let hibernationConfig = this.#runner.config.getActorHibernationConfig(actor.actorId, requestId);
 			this.#sendMessage(requestId, {
 				tag: "ToServerWebSocketOpen",
-				val: null,
+				val: {
+					canHibernate: hibernationConfig.enabled,
+					lastMsgIndex: BigInt(hibernationConfig.lastMsgIndex ?? -1),
+				},
 			});
 
 			// Notify adapter that connection is open
-			adapter._handleOpen();
+			adapter._handleOpen(requestId);
 
 			// Create a minimal request object for the websocket handler
 			// Include original headers from the open message
@@ -557,6 +589,7 @@ export class Tunnel {
 				val: {
 					code: 1011,
 					reason: "Server Error",
+					retry: false,
 				},
 			});
 
@@ -569,10 +602,11 @@ export class Tunnel {
 		}
 	}
 
+	/// Returns true if the message was NOT dispatched (no adapter for this request, or the adapter rejected it); false once dispatched.
 	async #handleWebSocketMessage(
 		requestId: ArrayBuffer,
-		msg: protocol.ToServerWebSocketMessage,
-	) {
+		msg: protocol.ToClientWebSocketMessage,
+	): Promise<boolean> {
 		const webSocketId = bufferToString(requestId);
 		const adapter = this.#actorWebSockets.get(webSocketId);
 		if (adapter) {
@@ -580,18 +614,39 @@ export class Tunnel {
 				? new Uint8Array(msg.data)
 				: new TextDecoder().decode(new Uint8Array(msg.data));
 
-			adapter._handleMessage(data, msg.binary);
+			return adapter._handleMessage(requestId, data, msg.index, msg.binary);
+		} else {
+			return true;
 		}
 	}
 
+	/** Sends a WebSocket message ack through the tunnel; throws unless `index` is an integer in [0, 65535]. */
+	__ackWebsocketMessage(requestId: ArrayBuffer, index: number) {
+		logger()?.debug({
+			msg: "ack ws msg",
+			requestId: uuidstringify(new Uint8Array(requestId)),
+			index,
+		});
+
+		// Number.isInteger also rejects NaN and fractional values, which a bare range comparison lets through.
+		if (!Number.isInteger(index) || index < 0 || index > 65535) throw new Error("invalid websocket ack index");
+
+		// Send the ack message
+		this.#sendMessage(requestId, {
+			tag: "ToServerWebSocketMessageAck",
+			val: { index },
+		});
+	}
+
 	async #handleWebSocketClose(
 		requestId: ArrayBuffer,
-		close: protocol.ToServerWebSocketClose,
+		close: protocol.ToClientWebSocketClose,
 	) {
 		const webSocketId = bufferToString(requestId);
 		const adapter = this.#actorWebSockets.get(webSocketId);
 		if (adapter) {
 			adapter._handleClose(
+				requestId,
 				close.code || undefined,
 				close.reason || undefined,
 			);
diff --git a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts
index eb46758d94..2fd46085b2 100644
--- a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts
+++ b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts
@@ -17,7 +17,7 @@ export class WebSocketTunnelAdapter {
 	#protocol = "";
 	#url = "";
 	#sendCallback: (data: ArrayBuffer | string, isBinary: boolean) => void;
-	#closeCallback: (code?: number, reason?: string) => void;
+	#closeCallback: (code?: number, reason?: string, retry?: boolean) => void;
 
 	// Event buffering for events fired before listeners are attached
 	#bufferedEvents: Array<{
@@ -28,7 +28,7 @@ export class WebSocketTunnelAdapter {
 	constructor(
 		webSocketId: string,
 		sendCallback: (data: ArrayBuffer | string, isBinary: boolean) => void,
-		closeCallback: (code?: number, reason?: string) => void,
+		closeCallback: (code?: number, reason?: string, retry?: boolean) => void,
 	) {
 		this.#webSocketId = webSocketId;
 		this.#sendCallback = sendCallback;
@@ -186,6 +186,14 @@ export class WebSocketTunnelAdapter {
 	}
 
 	close(code?: number, reason?: string): void {
+		this.closeInner(code, reason); // retry is left at closeInner's default (false)
+	}
+
+	__closeWithRetry(code?: number, reason?: string): void {
+		this.closeInner(code, reason, true); // same path as close(), but passes retry = true
+	}
+
+	closeInner(code?: number, reason?: string, retry: boolean = false): void {
 		if (
 			this.#readyState === 2 || // CLOSING
 			this.#readyState === 3 // CLOSED
@@ -196,7 +204,7 @@ export class WebSocketTunnelAdapter {
 		this.#readyState = 2; // CLOSING
 
 		// Send close through tunnel
-		this.#closeCallback(code, reason);
+		this.#closeCallback(code, reason, retry);
 
 		// Update state and fire event
 		this.#readyState = 3; // CLOSED
@@ -410,7 +418,7 @@ export class WebSocketTunnelAdapter {
 	}
 
 	// Internal methods called by the Tunnel class
-	_handleOpen(): void {
+	_handleOpen(requestId: ArrayBuffer): void {
 		if (this.#readyState !== 0) {
 			// CONNECTING
 			return;
@@ -420,16 +428,18 @@ export class WebSocketTunnelAdapter {
 
 		const event = {
 			type: "open",
+			rivetRequestId: requestId,
 			target: this,
 		};
 
 		this.#fireEvent("open", event);
 	}
 
-	_handleMessage(data: string | Uint8Array, isBinary: boolean): void {
+	/// Returns true if the message was dropped because the socket is not OPEN; false once the "message" event has been fired.
+	_handleMessage(requestId: ArrayBuffer, data: string | Uint8Array, index: number, isBinary: boolean): boolean {
 		if (this.#readyState !== 1) {
 			// OPEN
-			return;
+			return true;
 		}
 
 		let messageData: any;
@@ -460,15 +470,19 @@ export class WebSocketTunnelAdapter {
 		}
 
 		const event = {
-			data: messageData,
 			type: "message",
+			data: messageData,
+			rivetRequestId: requestId,
+			rivetMessageIndex: index,
 			target: this,
 		};
 
 		this.#fireEvent("message", event);
+
+		return false;
 	}
 
-	_handleClose(code?: number, reason?: string): void {
+	_handleClose(requestId: ArrayBuffer, code?: number, reason?: string): void {
 		if (this.#readyState === 3) {
 			// CLOSED
 			return;
@@ -477,10 +491,11 @@ export class WebSocketTunnelAdapter {
 		this.#readyState = 3; // CLOSED
 
 		const event = {
+			type: "close",
 			wasClean: true,
 			code: code || 1000,
 			reason: reason || "",
-			type: "close",
+			rivetRequestId: requestId,
 			target: this,
 		};
 
diff --git a/engine/sdks/typescript/test-runner/src/index.ts b/engine/sdks/typescript/test-runner/src/index.ts
index 50aa20c899..1d9815abc3 100644
--- a/engine/sdks/typescript/test-runner/src/index.ts
+++ b/engine/sdks/typescript/test-runner/src/index.ts
@@ -13,7 +13,7 @@ const INTERNAL_SERVER_PORT = process.env.INTERNAL_SERVER_PORT
 const RIVET_NAMESPACE = process.env.RIVET_NAMESPACE ?? "default";
 const RIVET_RUNNER_NAME = process.env.RIVET_RUNNER_NAME ?? "test-runner";
 const RIVET_RUNNER_KEY =
-	process.env.RIVET_RUNNER_KEY ?? `key-${Math.floor(Math.random() * 10000)}`;
+	process.env.RIVET_RUNNER_KEY;
 const RIVET_RUNNER_VERSION = process.env.RIVET_RUNNER_VERSION
 	? Number(process.env.RIVET_RUNNER_VERSION)
 	: 1;
@@ -28,7 +28,7 @@ const AUTOSTART_RUNNER = process.env.NO_AUTOSTART_RUNNER === undefined;
 let runnerStarted = Promise.withResolvers();
 let runnerStopped = Promise.withResolvers();
 let runner: Runner | null = null;
-const actorWebSockets = new Map<string, WebSocket>();
+const websocketLastMsgIndexes: Map<string, number> = new Map();
 
 // Create internal server
 const app = new Hono();
@@ -94,8 +94,6 @@ app.get("/start", async (c) => {
 	});
 });
 
-await autoConfigureServerless();
-
 if (AUTOSTART_SERVER) {
 	serve({
 		fetch: app.fetch,
@@ -106,8 +104,10 @@ if (AUTOSTART_SERVER) {
 	);
 }
 
-if (AUTOSTART_RUNNER)
+if (AUTOSTART_RUNNER) {
 	[runner, runnerStarted, runnerStopped] = await startRunner();
+}
+else await autoConfigureServerless();
 
 async function autoConfigureServerless() {
 	const res = await fetch(
@@ -155,13 +155,13 @@ async function startRunner(): Promise<
 		token: RIVET_TOKEN,
 		namespace: RIVET_NAMESPACE,
 		runnerName: RIVET_RUNNER_NAME,
-		runnerKey: RIVET_RUNNER_KEY,
+		runnerKey: RIVET_RUNNER_KEY ?? `key-${Math.floor(Math.random() * 10000)}`,
 		totalSlots: RIVET_RUNNER_TOTAL_SLOTS,
 		prepopulateActorNames: {},
 		onConnected: () => {
 			runnerStarted.resolve(undefined);
 		},
-		onDisconnected: () => {},
+		onDisconnected: () => { },
 		onShutdown: () => {
 			runnerStopped.resolve(undefined);
 		},
@@ -208,13 +208,12 @@ async function startRunner(): Promise<
 			);
 		},
 		websocket: async (
-			_runner: Runner,
+			runner: Runner,
 			actorId: string,
 			ws: WebSocket,
 			request: Request,
 		) => {
 			getLogger().info(`WebSocket connected for actor ${actorId}`);
-			actorWebSockets.set(actorId, ws);
 
 			// Echo server - send back any messages received
 			ws.addEventListener("message", (event) => {
@@ -222,13 +221,19 @@ async function startRunner(): Promise<
 				getLogger().info({
 					msg: `WebSocket message from actor ${actorId}`,
 					data,
+					index: (event as any).rivetMessageIndex,
 				});
+
 				ws.send(`Echo: ${data}`);
+
+				// Ack
+				const websocketId = Buffer.from((event as any).rivetRequestId).toString("base64");
+				websocketLastMsgIndexes.set(websocketId, (event as any).rivetMessageIndex);
+				runner.sendWebsocketMessageAck((event as any).rivetRequestId, (event as any).rivetMessageIndex);
 			});
 
 			ws.addEventListener("close", () => {
 				getLogger().info(`WebSocket closed for actor ${actorId}`);
-				actorWebSockets.delete(actorId);
 			});
 
 			ws.addEventListener("error", (error) => {
@@ -238,6 +243,13 @@ async function startRunner(): Promise<
 				});
 			});
 		},
+		// Always reports hibernation as enabled, plus the last message index recorded for this socket (if any).
+		getActorHibernationConfig(actorId, requestId) {
+			const socketKey = Buffer.from(requestId).toString("base64");
+			const lastMsgIndex = websocketLastMsgIndexes.get(socketKey);
+
+			return { enabled: true, lastMsgIndex };
+		},
 	};
 
 	const runner = new Runner(config);
diff --git a/scripts/tests/actor_sleep.ts b/scripts/tests/actor_sleep.ts
index e39765b47a..8c34a68626 100755
--- a/scripts/tests/actor_sleep.ts
+++ b/scripts/tests/actor_sleep.ts
@@ -13,10 +13,12 @@ async function main() {
 
 		// Create an actor
 		console.log("Creating actor...");
-		const actorResponse = await getOrCreateActor(RIVET_NAMESPACE, "test-runner", "key");
+		const actorResponse = await getOrCreateActor(RIVET_NAMESPACE, "test-runner", "key3");
 		console.log("Actor created:", actorResponse.actor);
 
 		for (let i = 0; i < 10; i++) {
+			await testWebSocket(actorResponse.actor.actor_id);
+
 			console.log("Sleeping actor...");
 			const actorSleepResponse = await fetch(`${RIVET_ENDPOINT}/sleep`, {
 				method: "GET",
@@ -38,7 +40,6 @@ async function main() {
 			// await new Promise(resolve => setTimeout(resolve, 2000));
 		}
 
-
 		// Make a request to the actor
 		console.log("Making request to actor...");
 		const actorPingResponse = await fetch(`${RIVET_ENDPOINT}/ping`, {
@@ -59,8 +60,6 @@ async function main() {
 		}
 
 		console.log("Actor ping response:", pingResult);
-
-		// await testWebSocket(actorResponse.actor.actor_id);
 	} catch (error) {
 		console.error(`Actor test failed:`, error);
 	}
@@ -89,14 +88,6 @@ function testWebSocket(actorId: string): Promise<void> {
 
 		let pingReceived = false;
 		let echoReceived = false;
-		const timeout = setTimeout(() => {
-			console.log(
-				"No response received within timeout, but connection was established",
-			);
-			// Connection was established, that's enough for the test
-			ws.close();
-			resolve();
-		}, 2000);
 
 		ws.addEventListener("open", () => {
 			console.log("WebSocket connected");
@@ -126,21 +117,18 @@ function testWebSocket(actorId: string): Promise<void> {
 				console.log("Echo test successful!");
 
 				// All tests passed
-				clearTimeout(timeout);
 				ws.close();
 				resolve();
 			}
 		});
 
-		ws.addEventListener("error", (error) => {
-			clearTimeout(timeout);
-			reject(new Error(`WebSocket error: ${error.message}`));
+		ws.addEventListener("error", (event) => {
+			reject(new Error(`WebSocket error: ${"message" in event ? event.message : event.type}`)); // an event object itself stringifies to "[object ...]"
 		});
 
-		ws.addEventListener("close", () => {
-			clearTimeout(timeout);
+		ws.addEventListener("close", event => {
 			if (!pingReceived || !echoReceived) {
-				reject(new Error("WebSocket closed before completing tests"));
+				reject(new Error(`WebSocket closed before completing tests: ${event.code} (${event.reason}) ${new Date().toISOString()}`));
 			}
 		});
 	});