diff --git a/.gitattributes b/.gitattributes index e465856f93..4525642f10 100644 --- a/.gitattributes +++ b/.gitattributes @@ -12,6 +12,8 @@ # Prevent from counting in the language statistics engine/artifacts/** linguist-generated=true engine/sdks/** linguist-generated=true +engine/sdks/typescript/runner/** linguist-generated=false +engine/sdks/typescript/test-runner/** linguist-generated=false engine/sdks/schema/** linguist-generated=false website/public/llms.txt linguist-generated=true diff --git a/Cargo.lock b/Cargo.lock index c3fdd146ac..ac09eba5ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -742,7 +742,7 @@ dependencies = [ [[package]] name = "clickhouse-inserter" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-channel", @@ -766,7 +766,7 @@ dependencies = [ [[package]] name = "clickhouse-user-query" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "clickhouse", "serde", @@ -1369,7 +1369,7 @@ checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" [[package]] name = "epoxy" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -1408,7 +1408,7 @@ dependencies = [ [[package]] name = "epoxy-protocol" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "rivet-util", @@ -1666,7 +1666,7 @@ dependencies = [ [[package]] name = "gasoline" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-stream", @@ -1713,7 +1713,7 @@ dependencies = [ [[package]] name = "gasoline-macros" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "proc-macro2", "quote", @@ -2341,7 +2341,7 @@ dependencies = [ [[package]] name = "internal" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "gasoline", @@ -2739,7 +2739,7 @@ dependencies = [ [[package]] name = "namespace" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "epoxy", @@ -3211,7 +3211,7 @@ checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" [[package]] name = "pegboard" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "base64 0.22.1", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "pegboard-actor-kv" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "futures-util", @@ -3259,7 +3259,7 @@ dependencies = [ [[package]] name = "pegboard-gateway" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-trait", @@ -3275,7 +3275,9 @@ dependencies = [ "rivet-guard-core", "rivet-runner-protocol", "rivet-util", + "scc", "serde", + "serde_json", "thiserror 1.0.69", "tokio", "tokio-tungstenite", @@ -3286,7 +3288,7 @@ dependencies = [ [[package]] name = "pegboard-runner" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-trait", @@ -3319,7 +3321,7 @@ dependencies = [ [[package]] name = "pegboard-serverless" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "base64 0.22.1", @@ -4018,7 +4020,7 @@ dependencies = [ [[package]] name = "rivet-api-builder" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -4062,7 +4064,7 @@ dependencies = [ [[package]] name = "rivet-api-peer" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -4091,7 +4093,7 @@ dependencies = [ [[package]] name = "rivet-api-public" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -4123,7 +4125,7 @@ dependencies = [ [[package]] name = "rivet-api-types" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "gasoline", @@ -4138,7 +4140,7 @@ dependencies = [ [[package]] name = "rivet-api-util" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -4158,7 +4160,7 @@ dependencies = [ [[package]] name = "rivet-bootstrap" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "epoxy", "gasoline", @@ -4174,7 +4176,7 @@ dependencies = [ [[package]] name = "rivet-cache" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "futures-util", @@ -4215,14 +4217,14 @@ dependencies = [ [[package]] name = "rivet-cache-result" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "rivet-util", ] [[package]] name = "rivet-config" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "config", @@ -4240,7 +4242,7 @@ dependencies = [ [[package]] name = "rivet-data" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "gasoline", @@ -4254,7 +4256,7 @@ dependencies = [ [[package]] name = "rivet-dump-openapi" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "rivet-api-public", "serde_json", @@ -4263,7 +4265,7 @@ dependencies = [ [[package]] name = "rivet-engine" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -4323,7 +4325,7 @@ dependencies = [ [[package]] name = "rivet-env" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "lazy_static", @@ -4333,7 +4335,7 @@ dependencies = [ [[package]] name = "rivet-error" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "indoc", @@ -4345,7 +4347,7 @@ dependencies = [ [[package]] name = "rivet-error-macros" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "indoc", "proc-macro2", @@ -4356,7 +4358,7 @@ dependencies = [ [[package]] name = "rivet-guard" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "axum 0.8.4", @@ -4389,6 +4391,7 @@ dependencies = [ "serde", "serde_json", "tokio", + "tokio-tungstenite", "tower 0.5.2", "tracing", "universaldb", @@ -4399,7 +4402,7 @@ dependencies = [ [[package]] name = "rivet-guard-core" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-trait", @@ -4444,7 +4447,7 @@ dependencies = [ [[package]] name = "rivet-logs" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "chrono", @@ -4458,7 +4461,7 @@ dependencies = [ [[package]] name = "rivet-metrics" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "console-subscriber", @@ -4476,7 +4479,7 @@ dependencies = [ [[package]] name = "rivet-pools" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-nats", @@ -4509,7 +4512,7 @@ dependencies = [ [[package]] name = "rivet-runner-protocol" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "base64 0.22.1", @@ -4524,7 +4527,7 @@ dependencies = [ [[package]] name = "rivet-runtime" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "console-subscriber", @@ -4550,7 +4553,7 @@ dependencies = [ [[package]] name = "rivet-service-manager" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "chrono", @@ -4565,7 +4568,7 @@ dependencies = [ [[package]] name = "rivet-telemetry" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "rivet-config", @@ -4577,7 +4580,7 @@ dependencies = [ [[package]] name = "rivet-term" version = "0.1.0" -source = "git+https://github.com/rivet-gg/rivet-term?rev=55e328470b68c557fb9bc8298369f90182d35b6d#55e328470b68c557fb9bc8298369f90182d35b6d" +source = "git+https://github.com/rivet-dev/rivet-term?rev=55e328470b68c557fb9bc8298369f90182d35b6d#55e328470b68c557fb9bc8298369f90182d35b6d" dependencies = [ "console", "derive_builder 0.12.0", @@ -4589,7 +4592,7 @@ dependencies = [ [[package]] name = "rivet-test-deps" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "futures-util", @@ -4607,7 +4610,7 @@ dependencies = [ [[package]] name = "rivet-test-deps-docker" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "portpicker", @@ -4638,7 +4641,7 @@ dependencies = [ [[package]] name = "rivet-types" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "gasoline", @@ -4655,7 +4658,7 @@ dependencies = [ [[package]] name = "rivet-ups-protocol" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "base64 0.22.1", @@ -4668,7 +4671,7 @@ dependencies = [ [[package]] name = "rivet-util" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-trait", @@ -4697,7 +4700,7 @@ dependencies = [ [[package]] name = "rivet-util-id" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "serde", "thiserror 1.0.69", @@ -4708,7 +4711,7 @@ dependencies = [ [[package]] name = "rivet-workflow-worker" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "epoxy", @@ -4929,6 +4932,12 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "saa" +version = "5.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f895faf11c46e98547f4de603a113ca76708d4b6832dbbe3c26528b7b81aca3b" + [[package]] name = "safe_arch" version = "0.7.4" @@ -4938,6 +4947,16 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "scc" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0b9e1890c5b17833a779c68a974f04170dfa36e3789395d17845418cc779ac" +dependencies = [ + "saa", + "sdd", +] + [[package]] name = "schannel" version = "0.1.27" @@ -5019,6 +5038,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "sdd" +version = "4.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a8729f5224c38cb041e72fa9968dd4e379d3487b85359539d31d75ed95992d8" + [[package]] name = "sealed" version = "0.4.0" @@ -6325,7 +6350,7 @@ checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" [[package]] name = "universaldb" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-trait", @@ -6352,7 +6377,7 @@ dependencies = [ [[package]] name = "universalpubsub" -version = "25.8.2" +version = "2.0.22-rc.1" dependencies = [ "anyhow", "async-nats", @@ -6481,9 +6506,8 @@ checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "vbare" -version = "0.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf4d898b11572484cc064900e2a63dc88f72c621c2c52fd032b14537668702e" +version = "0.0.3" +source = "git+https://github.com/rivet-dev/vbare?rev=3ae474a0234801bb96d70bec4eddd4f2d640971e#3ae474a0234801bb96d70bec4eddd4f2d640971e" dependencies = [ "anyhow", ] diff --git a/Cargo.toml b/Cargo.toml index 4b9d52c8ce..3c55f6a56d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,7 @@ regex = "1.4" rstest = "0.26.1" rustls-pemfile = "2.2.0" rustyline = "15.0.0" +scc = "3.3.2" serde_bare = "0.5.0" serde_html_form = "0.2.7" serde_yaml = "0.9.34" @@ -147,7 +148,7 @@ version = "4.3" features = ["derive"] [workspace.dependencies.rivet-term] -git = "https://github.com/rivet-gg/rivet-term" +git = "https://github.com/rivet-dev/rivet-term" rev = "55e328470b68c557fb9bc8298369f90182d35b6d" [workspace.dependencies.clickhouse] @@ -248,7 +249,8 @@ default-features = false features = ["ansi","fmt","json","env-filter"] [workspace.dependencies.vbare] -version = "0.0.2" +git = "https://github.com/rivet-dev/vbare" +rev = "3ae474a0234801bb96d70bec4eddd4f2d640971e" [workspace.dependencies.vbare-compiler] version = "0.0.2" diff --git a/engine/artifacts/errors/guard.websocket_pending_limit_reached.json b/engine/artifacts/errors/guard.websocket_pending_limit_reached.json new file mode 100644 index 0000000000..770c084ec3 --- /dev/null +++ b/engine/artifacts/errors/guard.websocket_pending_limit_reached.json @@ -0,0 +1,5 @@ +{ + "code": "websocket_pending_limit_reached", + "group": "guard", + "message": "Reached limit on pending websocket messages, aborting connection." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.websocket_service_retry.json b/engine/artifacts/errors/guard.websocket_service_retry.json new file mode 100644 index 0000000000..e73bbbc507 --- /dev/null +++ b/engine/artifacts/errors/guard.websocket_service_retry.json @@ -0,0 +1,5 @@ +{ + "code": "websocket_service_retry", + "group": "guard", + "message": "WebSocket service retry." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.websocket_service_timeout.json b/engine/artifacts/errors/guard.websocket_service_timeout.json new file mode 100644 index 0000000000..41adc87d1c --- /dev/null +++ b/engine/artifacts/errors/guard.websocket_service_timeout.json @@ -0,0 +1,5 @@ +{ + "code": "websocket_service_timeout", + "group": "guard", + "message": "WebSocket service timed out." +} \ No newline at end of file diff --git a/engine/artifacts/openapi.json b/engine/artifacts/openapi.json index bb4dbf7b3b..517c5fa80b 100644 --- a/engine/artifacts/openapi.json +++ b/engine/artifacts/openapi.json @@ -11,7 +11,7 @@ "name": "Apache-2.0", "identifier": "Apache-2.0" }, - "version": "25.8.2" + "version": "2.0.22-rc.1" }, "paths": { "/actors": { diff --git a/engine/docker/dev-host/grafana/dashboards/traces.json b/engine/docker/dev-host/grafana/dashboards/traces.json index 54c2d0aefb..9bb36ad163 100644 --- a/engine/docker/dev-host/grafana/dashboards/traces.json +++ b/engine/docker/dev-host/grafana/dashboards/traces.json @@ -173,7 +173,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", "refId": "A" } ], @@ -431,7 +431,7 @@ }, "pluginVersion": "4.9.0", "queryType": "table", - "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", + "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", "refId": "A" } ], @@ -581,7 +581,7 @@ }, "pluginVersion": "4.0.6", "queryType": "traces", - "rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", + "rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", "refId": "A" } ], @@ -722,7 +722,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId, $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId IN (${trace_id:singlequote}), $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", "refId": "A" } ], @@ -886,14 +886,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Span", "multi": true, "name": "span_name", "options": [], - "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -908,14 +908,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Ray ID", "multi": true, "name": "ray_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -930,29 +930,17 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Workflow ID", "multi": true, "name": "workflow_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" - }, - { - "current": { - "text": "30", - "value": "30" - }, - "hide": 2, - "label": "Metric Export Interval (seconds)", - "name": "metric_interval", - "query": "30", - "skipUrlSync": true, - "type": "constant" } ] }, diff --git a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json index 54c2d0aefb..9bb36ad163 100644 --- a/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json +++ b/engine/docker/dev-multidc-multinode/core/grafana/dashboards/traces.json @@ -173,7 +173,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", "refId": "A" } ], @@ -431,7 +431,7 @@ }, "pluginVersion": "4.9.0", "queryType": "table", - "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", + "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", "refId": "A" } ], @@ -581,7 +581,7 @@ }, "pluginVersion": "4.0.6", "queryType": "traces", - "rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", + "rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", "refId": "A" } ], @@ -722,7 +722,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId, $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId IN (${trace_id:singlequote}), $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", "refId": "A" } ], @@ -886,14 +886,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Span", "multi": true, "name": "span_name", "options": [], - "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -908,14 +908,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Ray ID", "multi": true, "name": "ray_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -930,29 +930,17 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Workflow ID", "multi": true, "name": "workflow_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" - }, - { - "current": { - "text": "30", - "value": "30" - }, - "hide": 2, - "label": "Metric Export Interval (seconds)", - "name": "metric_interval", - "query": "30", - "skipUrlSync": true, - "type": "constant" } ] }, diff --git a/engine/docker/dev-multidc/core/grafana/dashboards/traces.json b/engine/docker/dev-multidc/core/grafana/dashboards/traces.json index 54c2d0aefb..9bb36ad163 100644 --- a/engine/docker/dev-multidc/core/grafana/dashboards/traces.json +++ b/engine/docker/dev-multidc/core/grafana/dashboards/traces.json @@ -173,7 +173,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", "refId": "A" } ], @@ -431,7 +431,7 @@ }, "pluginVersion": "4.9.0", "queryType": "table", - "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", + "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", "refId": "A" } ], @@ -581,7 +581,7 @@ }, "pluginVersion": "4.0.6", "queryType": "traces", - "rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", + "rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", "refId": "A" } ], @@ -722,7 +722,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId, $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId IN (${trace_id:singlequote}), $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", "refId": "A" } ], @@ -886,14 +886,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Span", "multi": true, "name": "span_name", "options": [], - "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -908,14 +908,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Ray ID", "multi": true, "name": "ray_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -930,29 +930,17 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Workflow ID", "multi": true, "name": "workflow_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" - }, - { - "current": { - "text": "30", - "value": "30" - }, - "hide": 2, - "label": "Metric Export Interval (seconds)", - "name": "metric_interval", - "query": "30", - "skipUrlSync": true, - "type": "constant" } ] }, diff --git a/engine/docker/dev-multinode/grafana/dashboards/traces.json b/engine/docker/dev-multinode/grafana/dashboards/traces.json index 54c2d0aefb..9bb36ad163 100644 --- a/engine/docker/dev-multinode/grafana/dashboards/traces.json +++ b/engine/docker/dev-multinode/grafana/dashboards/traces.json @@ -173,7 +173,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", "refId": "A" } ], @@ -431,7 +431,7 @@ }, "pluginVersion": "4.9.0", "queryType": "table", - "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", + "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", "refId": "A" } ], @@ -581,7 +581,7 @@ }, "pluginVersion": "4.0.6", "queryType": "traces", - "rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", + "rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", "refId": "A" } ], @@ -722,7 +722,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId, $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId IN (${trace_id:singlequote}), $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", "refId": "A" } ], @@ -886,14 +886,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Span", "multi": true, "name": "span_name", "options": [], - "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -908,14 +908,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Ray ID", "multi": true, "name": "ray_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -930,29 +930,17 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Workflow ID", "multi": true, "name": "workflow_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" - }, - { - "current": { - "text": "30", - "value": "30" - }, - "hide": 2, - "label": "Metric Export Interval (seconds)", - "name": "metric_interval", - "query": "30", - "skipUrlSync": true, - "type": "constant" } ] }, diff --git a/engine/docker/dev/docker-compose.yml b/engine/docker/dev/docker-compose.yml index 313c6d0c4e..6733fda6b5 100644 --- a/engine/docker/dev/docker-compose.yml +++ b/engine/docker/dev/docker-compose.yml @@ -130,6 +130,8 @@ services: networks: - rivet-network - rivet-network-to-core + ports: + - '4317:4317' otel-collector-client: image: otel/opentelemetry-collector-contrib:latest restart: unless-stopped diff --git a/engine/docker/dev/grafana/dashboards/traces.json b/engine/docker/dev/grafana/dashboards/traces.json index 54c2d0aefb..9bb36ad163 100644 --- a/engine/docker/dev/grafana/dashboards/traces.json +++ b/engine/docker/dev/grafana/dashboards/traces.json @@ -173,7 +173,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", "refId": "A" } ], @@ -431,7 +431,7 @@ }, "pluginVersion": "4.9.0", "queryType": "table", - "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", + "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", "refId": "A" } ], @@ -581,7 +581,7 @@ }, "pluginVersion": "4.0.6", "queryType": "traces", - "rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", + "rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", "refId": "A" } ], @@ -722,7 +722,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId, $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId IN (${trace_id:singlequote}), $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", "refId": "A" } ], @@ -886,14 +886,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Span", "multi": true, "name": "span_name", "options": [], - "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -908,14 +908,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Ray ID", "multi": true, "name": "ray_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -930,29 +930,17 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Workflow ID", "multi": true, "name": "workflow_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" - }, - { - "current": { - "text": "30", - "value": "30" - }, - "hide": 2, - "label": "Metric Export Interval (seconds)", - "name": "metric_interval", - "query": "30", - "skipUrlSync": true, - "type": "constant" } ] }, diff --git a/engine/docker/template/grafana-dashboards/traces.json b/engine/docker/template/grafana-dashboards/traces.json index 54c2d0aefb..9bb36ad163 100644 --- a/engine/docker/template/grafana-dashboards/traces.json +++ b/engine/docker/template/grafana-dashboards/traces.json @@ -173,7 +173,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n ServiceName,\r\n count() as ` `\r\nFROM otel.otel_traces\r\nWHERE\r\n ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\nGROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000\r\n", "refId": "A" } ], @@ -431,7 +431,7 @@ }, "pluginVersion": "4.9.0", "queryType": "table", - "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", + "rawSql": "SELECT\r\n (argMin(StatusCode, Timestamp) = 'Error' ? '⚠️' : '') as ` `,\r\n min(Timestamp) as Ts,\r\n TraceId as `Trace ID`,\r\n argMin(ServiceName, Timestamp) as `Service Name`,\r\n argMin(SpanName, Timestamp) as `Span Name`,\r\n argMin(coalesce(NULLIF(SpanAttributes['uri'], ''), NULLIF(SpanAttributes['workflow_id'], ''), SpanAttributes['actor_id']), Timestamp) as `URI/workflow_id/actor_id`,\r\n divide(max(Duration), 1000000) as Duration\r\nFROM otel.otel_traces\r\nWHERE\r\n ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND ServiceName != 'loadgenerator'\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\nGROUP BY TraceId\r\nORDER BY Duration DESC\r\nLIMIT 100\r\n", "refId": "A" } ], @@ -581,7 +581,7 @@ }, "pluginVersion": "4.0.6", "queryType": "traces", - "rawSql": "WITH\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE $__conditionalAll(TraceId, $trace_id)) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\t$__conditionalAll(traceID, $trace_id) AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", + "rawSql": "WITH\n\t'${trace_id}' as trace_id,\n\t(SELECT min(Start) FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_start,\n\t(SELECT max(End) + 1 FROM otel.otel_traces_trace_id_ts WHERE TraceId = trace_id) as trace_end\nSELECT\n\tTraceId as traceID,\n\tSpanId as spanID,\n\tParentSpanId as parentSpanID,\n\tServiceName as serviceName,\n\tSpanName as operationName, Timestamp as startTime,\n\tmultiply(Duration, 0.000001) as duration,\n\tarrayMap(key -> map('key', key, 'value', SpanAttributes[key]), mapKeys(SpanAttributes)) as tags,\n\tarrayMap(key -> map('key', key, 'value', ResourceAttributes[key]), mapKeys(ResourceAttributes)) as serviceTags,\n\tarrayMap((name, timestamp, attributes) -> tuple(name, toString(multiply(toUnixTimestamp64Nano(timestamp), 0.000001)), arrayMap( key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(name String, timestamp String, fields Array(Map(String, String))), `Events.Name`, `Events.Timestamp`, `Events.Attributes`) AS logs,\n\tarrayMap((traceID, spanID, attributes) -> tuple(traceID, spanID, arrayMap(key -> map('key', key, 'value', attributes[key]), mapKeys(attributes)))::Tuple(traceID String, spanID String, tags Array(Map(String, String))), `Links.TraceId`, `Links.SpanId`, `Links.Attributes`) AS references\nFROM otel.otel_traces\nWHERE\n\ttraceID = trace_id AND startTime >= trace_start AND startTime <= trace_end AND ( Duration > 0 )\nORDER BY Timestamp DESC, Duration DESC\nLIMIT 1000", "refId": "A" } ], @@ -722,7 +722,7 @@ }, "pluginVersion": "4.0.6", "queryType": "timeseries", - "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId, $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND $__conditionalAll(ServiceName, $service_name)\r\n AND $__conditionalAll(SpanName, $span_name)\r\n AND (($ray_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['ray_id'], $ray_id)\r\n AND (($workflow_id, NULL).1 = 'All' ? true : $__conditionalAll(SpanAttributes['workflow_id'], $workflow_id))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", + "rawSql": "SELECT\r\n $__timeInterval(Timestamp) as time,\r\n count(*) as ` `,\r\n ServiceName\r\nFROM otel.otel_traces\r\nWHERE\r\n $__conditionalAll(TraceId IN (${trace_id:singlequote}), $trace_id)\r\n AND ( Timestamp >= $__fromTime AND Timestamp <= $__toTime )\r\n AND ServiceName IN (${service_name:singlequote})\r\n AND SpanName IN (${span_name:singlequote})\r\n AND (($ray_id, NULL).1 = 'All' ? true : SpanAttributes['ray_id'] IN (${ray_id:singlequote}))\r\n AND (($workflow_id, NULL).1 = 'All' ? true : SpanAttributes['workflow_id'] IN (${workflow_id:singlequote}))\r\n AND StatusCode IN ('Error', 'STATUS_CODE_ERROR')\r\n AND ServiceName != 'loadgenerator' GROUP BY ServiceName, time\r\nORDER BY time ASC\r\nLIMIT 100000", "refId": "A" } ], @@ -886,14 +886,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Span", "multi": true, "name": "span_name", "options": [], - "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanName FROM otel.otel_traces WHERE ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -908,14 +908,14 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Ray ID", "multi": true, "name": "ray_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['ray_id'] FROM otel.otel_traces WHERE SpanAttributes['ray_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" @@ -930,29 +930,17 @@ "type": "grafana-clickhouse-datasource", "uid": "clickhouse" }, - "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "definition": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "description": "", "includeAll": true, "label": "Workflow ID", "multi": true, "name": "workflow_id", "options": [], - "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND $__conditionalAll(ServiceName, $service_name) LIMIT 1000;", + "query": "SELECT DISTINCT SpanAttributes['workflow_id'] FROM otel.otel_traces WHERE SpanAttributes['workflow_id'] != '' AND ServiceName IN (${service_name:singlequote}) LIMIT 1000;", "refresh": 1, "regex": "", "type": "query" - }, - { - "current": { - "text": "30", - "value": "30" - }, - "hide": 2, - "label": "Metric Export Interval (seconds)", - "name": "metric_interval", - "query": "30", - "skipUrlSync": true, - "type": "constant" } ] }, diff --git a/engine/packages/api-public/src/actors/get_or_create.rs b/engine/packages/api-public/src/actors/get_or_create.rs index 8f0403eef3..3fd6071442 100644 --- a/engine/packages/api-public/src/actors/get_or_create.rs +++ b/engine/packages/api-public/src/actors/get_or_create.rs @@ -1,8 +1,5 @@ use anyhow::Result; -use axum::{ - http::HeaderMap, - response::{IntoResponse, Response}, -}; +use axum::response::{IntoResponse, Response}; use rivet_api_builder::{ ApiError, extract::{Extension, Json, Query}, @@ -77,11 +74,10 @@ pub struct GetOrCreateResponse { )] pub async fn get_or_create( Extension(ctx): Extension, - headers: HeaderMap, Query(query): Query, Json(body): Json, ) -> Response { - match get_or_create_inner(ctx, headers, query, body).await { + match get_or_create_inner(ctx, query, body).await { Ok(response) => Json(response).into_response(), Err(err) => ApiError::from(err).into_response(), } @@ -90,7 +86,6 @@ pub async fn get_or_create( #[tracing::instrument(skip_all)] async fn get_or_create_inner( ctx: ApiCtx, - headers: HeaderMap, query: GetOrCreateQuery, body: GetOrCreateRequest, ) -> Result { diff --git a/engine/packages/api-public/src/health.rs b/engine/packages/api-public/src/health.rs index d74a60827b..9528d716f8 100644 --- a/engine/packages/api-public/src/health.rs +++ b/engine/packages/api-public/src/health.rs @@ -87,7 +87,7 @@ async fn fanout_inner(ctx: ApiCtx) -> Result { } } else { // Remote datacenter - HTTP request - match send_health_checks(&ctx, &dc).await { + match send_health_checks(&dc).await { Ok(response) => DatacenterHealth { datacenter_label: dc.datacenter_label, datacenter_name: dc.name.clone(), @@ -129,7 +129,6 @@ async fn fanout_inner(ctx: ApiCtx) -> Result { #[tracing::instrument(skip_all)] async fn send_health_checks( - ctx: &ApiCtx, dc: &rivet_config::config::topology::Datacenter, ) -> Result { let client = rivet_pools::reqwest::client().await?; diff --git a/engine/packages/api-public/src/metadata.rs b/engine/packages/api-public/src/metadata.rs index 694143b042..c4a7a2cb64 100644 --- a/engine/packages/api-public/src/metadata.rs +++ b/engine/packages/api-public/src/metadata.rs @@ -1,6 +1,6 @@ use axum::Json; use axum::response::IntoResponse; -use rivet_api_builder::{ApiError, extract::Extension}; +use rivet_api_builder::extract::Extension; use serde_json::json; use crate::ctx::ApiCtx; diff --git a/engine/packages/api-public/src/runner_configs/delete.rs b/engine/packages/api-public/src/runner_configs/delete.rs index caa966cfd0..551986683f 100644 --- a/engine/packages/api-public/src/runner_configs/delete.rs +++ b/engine/packages/api-public/src/runner_configs/delete.rs @@ -1,4 +1,4 @@ -use anyhow::{Context, Result}; +use anyhow::Result; use axum::response::{IntoResponse, Response}; use futures_util::{StreamExt, TryStreamExt}; use rivet_api_builder::{ diff --git a/engine/packages/api-public/src/runner_configs/utils.rs b/engine/packages/api-public/src/runner_configs/utils.rs index cb2d0939c3..85c5afbb96 100644 --- a/engine/packages/api-public/src/runner_configs/utils.rs +++ b/engine/packages/api-public/src/runner_configs/utils.rs @@ -114,7 +114,7 @@ pub async fn fetch_serverless_runner_metadata( }); } - let payload = serde_json::from_str::(&body_raw).map_err(|err| { + let payload = serde_json::from_str::(&body_raw).map_err(|_| { ServerlessMetadataError::InvalidResponseJson { body: body_for_user, } diff --git a/engine/packages/epoxy/src/http_client.rs b/engine/packages/epoxy/src/http_client.rs index 779670c57c..1e325b0bbd 100644 --- a/engine/packages/epoxy/src/http_client.rs +++ b/engine/packages/epoxy/src/http_client.rs @@ -135,7 +135,7 @@ pub async fn send_message_to_address( let client = rivet_pools::reqwest::client().await?; // Create the request - let request = versioned::Request::latest(request); + let request = versioned::Request::wrap_latest(request); // Send the request let response_result = client diff --git a/engine/packages/epoxy/src/http_routes.rs b/engine/packages/epoxy/src/http_routes.rs index 79ba36d00d..e2b987f542 100644 --- a/engine/packages/epoxy/src/http_routes.rs +++ b/engine/packages/epoxy/src/http_routes.rs @@ -1,6 +1,6 @@ use anyhow::*; use axum::body::Bytes; -use epoxy_protocol::{protocol, versioned}; +use epoxy_protocol::versioned; use rivet_api_builder::prelude::*; use vbare::OwnedVersionedData; @@ -30,5 +30,5 @@ pub async fn message(ctx: ApiCtx, path: VersionedPath, _query: (), body: Bytes) // Process message directly using ops let response = crate::replica::message_request::message_request(&ctx, request).await?; - versioned::Response::latest(response).serialize(path.version) + versioned::Response::wrap_latest(response).serialize(path.version) } diff --git a/engine/packages/epoxy/src/keys/keys.rs b/engine/packages/epoxy/src/keys/keys.rs index bbe22ba2d3..2e8ccf1729 100644 --- a/engine/packages/epoxy/src/keys/keys.rs +++ b/engine/packages/epoxy/src/keys/keys.rs @@ -1,5 +1,4 @@ use anyhow::*; -use epoxy_protocol::protocol::ReplicaId; use std::result::Result::Ok; use universaldb::prelude::*; diff --git a/engine/packages/epoxy/src/keys/replica.rs b/engine/packages/epoxy/src/keys/replica.rs index e65788d3a0..1bd6f95997 100644 --- a/engine/packages/epoxy/src/keys/replica.rs +++ b/engine/packages/epoxy/src/keys/replica.rs @@ -52,7 +52,7 @@ impl FormalKey for LogEntryKey { } fn serialize(&self, value: Self::Value) -> Result> { - epoxy_protocol::versioned::LogEntry::latest(value) + epoxy_protocol::versioned::LogEntry::wrap_latest(value) .serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION) } } @@ -162,7 +162,7 @@ impl FormalKey for ConfigKey { } fn serialize(&self, value: Self::Value) -> Result> { - epoxy_protocol::versioned::ClusterConfig::latest(value) + epoxy_protocol::versioned::ClusterConfig::wrap_latest(value) .serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION) } } @@ -223,7 +223,7 @@ impl FormalKey for CurrentBallotKey { } fn serialize(&self, value: Self::Value) -> Result> { - epoxy_protocol::versioned::Ballot::latest(value) + epoxy_protocol::versioned::Ballot::wrap_latest(value) .serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION) } } @@ -262,7 +262,7 @@ impl FormalKey for InstanceBallotKey { } fn serialize(&self, value: Self::Value) -> Result> { - epoxy_protocol::versioned::Ballot::latest(value) + epoxy_protocol::versioned::Ballot::wrap_latest(value) .serialize_with_embedded_version(epoxy_protocol::PROTOCOL_VERSION) } } diff --git a/engine/packages/epoxy/src/ops/explicit_prepare.rs b/engine/packages/epoxy/src/ops/explicit_prepare.rs index e821303eff..2445f9d034 100644 --- a/engine/packages/epoxy/src/ops/explicit_prepare.rs +++ b/engine/packages/epoxy/src/ops/explicit_prepare.rs @@ -3,7 +3,7 @@ use epoxy_protocol::protocol::{self, ReplicaId}; use gas::prelude::*; use rivet_api_builder::ApiCtx; -use crate::{http_client, replica, types, utils}; +use crate::{http_client, replica, utils}; #[derive(Debug)] pub struct Input { @@ -79,15 +79,8 @@ pub async fn epoxy_explicit_prepare( let result = match analyze_prepare_responses(&highest_ballot_responses, instance) { PrepareDecision::Commit(payload) => { // EPaxos Step 29: Run Commit phase - let result = crate::ops::propose::commit( - ctx, - &config, - replica_id, - &quorum_members, - payload, - false, - ) - .await?; + let result = + crate::ops::propose::commit(ctx, &config, replica_id, payload, false).await?; convert_proposal_result(result) } PrepareDecision::Accept(payload) => { diff --git a/engine/packages/epoxy/src/ops/kv/get_local.rs b/engine/packages/epoxy/src/ops/kv/get_local.rs index 64b3f95116..3df54bdee1 100644 --- a/engine/packages/epoxy/src/ops/kv/get_local.rs +++ b/engine/packages/epoxy/src/ops/kv/get_local.rs @@ -1,7 +1,6 @@ use anyhow::*; use epoxy_protocol::protocol::ReplicaId; use gas::prelude::*; -use rivet_api_builder::prelude::*; use universaldb::utils::{FormalKey, IsolationLevel::*}; use crate::keys; @@ -30,15 +29,12 @@ pub async fn epoxy_kv_get_local(ctx: &OperationCtx, input: &Input) -> Result Resul let kv_key = kv_key.clone(); let cache_key = cache_key.clone(); async move { - (async move { - let (value, cache_value) = tokio::try_join!( - async { - let v = tx.get(&packed_key, Serializable).await?; - if let Some(ref bytes) = v { - Ok(Some(kv_key.deserialize(bytes)?)) - } else { - Ok(None) - } - }, - async { - let v = tx.get(&packed_cache_key, Serializable).await?; - if let Some(ref bytes) = v { - Ok(Some(cache_key.deserialize(bytes)?)) - } else { - Ok(None) - } + let (value, cache_value) = tokio::try_join!( + async { + let v = tx.get(&packed_key, Serializable).await?; + if let Some(ref bytes) = v { + Ok(Some(kv_key.deserialize(bytes)?)) + } else { + Ok(None) } - )?; + }, + async { + let v = tx.get(&packed_cache_key, Serializable).await?; + if let Some(ref bytes) = v { + Ok(Some(cache_key.deserialize(bytes)?)) + } else { + Ok(None) + } + } + )?; - Ok(value.or(cache_value)) - }) - .await + Ok(value.or(cache_value)) } }) .custom_instrument(tracing::info_span!("get_optimistic_tx")) @@ -134,13 +131,11 @@ pub async fn epoxy_kv_get_optimistic(ctx: &OperationCtx, input: &Input) -> Resul let packed_cache_key = packed_cache_key.clone(); let cache_key = cache_key.clone(); let value_to_cache = value.clone(); + async move { - (async move { - let serialized = cache_key.serialize(value_to_cache)?; - tx.set(&packed_cache_key, &serialized); - Ok(()) - }) - .await + let serialized = cache_key.serialize(value_to_cache)?; + tx.set(&packed_cache_key, &serialized); + Ok(()) } }) .custom_instrument(tracing::info_span!("cache_value_tx")) diff --git a/engine/packages/epoxy/src/ops/kv/mod.rs b/engine/packages/epoxy/src/ops/kv/mod.rs index e30c032b35..aac85283e3 100644 --- a/engine/packages/epoxy/src/ops/kv/mod.rs +++ b/engine/packages/epoxy/src/ops/kv/mod.rs @@ -1,2 +1,3 @@ pub mod get_local; pub mod get_optimistic; +pub mod purge_local; diff --git a/engine/packages/epoxy/src/ops/kv/purge_local.rs b/engine/packages/epoxy/src/ops/kv/purge_local.rs new file mode 100644 index 0000000000..c1c2b6731e --- /dev/null +++ b/engine/packages/epoxy/src/ops/kv/purge_local.rs @@ -0,0 +1,28 @@ +use anyhow::*; +use epoxy_protocol::protocol::ReplicaId; +use gas::prelude::*; + +use crate::keys; + +#[derive(Debug)] +pub struct Input { + pub replica_id: ReplicaId, + pub keys: Vec>, +} + +#[operation] +pub async fn epoxy_kv_purge_local(ctx: &OperationCtx, input: &Input) -> Result<()> { + ctx.udb()? + .run(|tx| async move { + let tx = tx.with_subspace(keys::subspace(input.replica_id)); + + for key in &input.keys { + tx.delete(&keys::keys::KvOptimisticCacheKey::new(key.clone())); + } + + Ok(()) + }) + .await?; + + Ok(()) +} diff --git a/engine/packages/epoxy/src/ops/propose.rs b/engine/packages/epoxy/src/ops/propose.rs index eea9bcda6f..3435aa97d8 100644 --- a/engine/packages/epoxy/src/ops/propose.rs +++ b/engine/packages/epoxy/src/ops/propose.rs @@ -1,8 +1,9 @@ use anyhow::*; +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64; use epoxy_protocol::protocol::{self, Path, Payload, ReplicaId}; use gas::prelude::*; use rivet_api_builder::prelude::*; -use rivet_config::Config; use crate::{http_client, replica, utils}; @@ -69,15 +70,7 @@ pub async fn epoxy_propose(ctx: &OperationCtx, input: &Input) -> Result { - commit( - ctx, - &config, - replica_id, - &quorum_members, - payload, - input.purge_cache, - ) - .await + commit(ctx, &config, replica_id, payload, input.purge_cache).await } Path::PathSlow(protocol::PathSlow { payload }) => { run_paxos_accept( @@ -126,15 +119,7 @@ pub async fn run_paxos_accept( // EPaxos Step 20 if quorum >= utils::calculate_quorum(quorum_members.len(), utils::QuorumType::Slow) { - commit( - ctx, - &config, - replica_id, - &quorum_members, - payload_for_accepts, - purge_cache, - ) - .await + commit(ctx, &config, replica_id, payload_for_accepts, purge_cache).await } else { Ok(ProposalResult::ConsensusFailed) } @@ -145,7 +130,6 @@ pub async fn commit( ctx: &OperationCtx, config: &protocol::ClusterConfig, replica_id: ReplicaId, - quorum_members: &[ReplicaId], payload: Payload, purge_cache: bool, ) -> Result { @@ -184,6 +168,27 @@ pub async fn commit( } }); + if purge_cache { + let keys = payload + .proposal + .commands + .iter() + .map(replica::utils::extract_key_from_command) + .flatten() + .map(|key| BASE64.encode(key)) + .collect::>(); + + // Purge optimistic cache for all dcs + if !keys.is_empty() { + let ctx = ctx.clone(); + tokio::spawn(async move { + if let Err(err) = purge_optimistic_cache(ctx, keys).await { + tracing::error!(?err, "failed purging optimistic cache"); + } + }); + } + } + if let Some(cmd_err) = cmd_err { Ok(ProposalResult::CommandError(cmd_err)) } else { @@ -326,3 +331,22 @@ async fn send_commits( Ok(()) } + +async fn purge_optimistic_cache(ctx: OperationCtx, keys: Vec) -> Result<()> { + for dc in &ctx.config().topology().datacenters { + let workflow_id = ctx + .workflow(crate::workflows::purger::Input { + replica_id: dc.datacenter_label as u64, + }) + .tag("replica_id", dc.datacenter_label as u64) + .unique() + .dispatch() + .await?; + ctx.signal(crate::workflows::purger::Purge { keys: keys.clone() }) + .to_workflow_id(workflow_id) + .send() + .await?; + } + + Ok(()) +} diff --git a/engine/packages/epoxy/src/ops/read_cluster_config.rs b/engine/packages/epoxy/src/ops/read_cluster_config.rs index 62a23c2092..366d1062b6 100644 --- a/engine/packages/epoxy/src/ops/read_cluster_config.rs +++ b/engine/packages/epoxy/src/ops/read_cluster_config.rs @@ -1,5 +1,5 @@ use anyhow::*; -use epoxy_protocol::protocol::{self, ReplicaId}; +use epoxy_protocol::protocol::{self}; use gas::prelude::*; use crate::utils; diff --git a/engine/packages/epoxy/src/replica/lead_consensus.rs b/engine/packages/epoxy/src/replica/lead_consensus.rs index 8103e78868..5af4edda10 100644 --- a/engine/packages/epoxy/src/replica/lead_consensus.rs +++ b/engine/packages/epoxy/src/replica/lead_consensus.rs @@ -4,7 +4,7 @@ use universaldb::Transaction; use universaldb::utils::{FormalKey, IsolationLevel::*}; use crate::keys; -use crate::replica::{ballot, messages, utils}; +use crate::replica::{ballot, utils}; #[tracing::instrument(skip_all)] pub async fn lead_consensus( diff --git a/engine/packages/epoxy/src/replica/message_request.rs b/engine/packages/epoxy/src/replica/message_request.rs index 4e41d1abf1..7466c3331c 100644 --- a/engine/packages/epoxy/src/replica/message_request.rs +++ b/engine/packages/epoxy/src/replica/message_request.rs @@ -1,5 +1,5 @@ use anyhow::*; -use epoxy_protocol::protocol::{self, ReplicaId}; +use epoxy_protocol::protocol::{self}; use gas::prelude::*; use rivet_api_builder::prelude::*; @@ -150,6 +150,16 @@ pub async fn message_request( value: result.value, }) } + protocol::RequestKind::KvPurgeRequest(req) => { + // Handle KV purge request + ctx.op(ops::kv::purge_local::Input { + replica_id: current_replica_id, + keys: req.keys.clone(), + }) + .await?; + + protocol::ResponseKind::KvPurgeResponse + } }; Ok(protocol::Response { kind }) diff --git a/engine/packages/epoxy/src/replica/messages/accept.rs b/engine/packages/epoxy/src/replica/messages/accept.rs index f123796d0d..af1588dd1f 100644 --- a/engine/packages/epoxy/src/replica/messages/accept.rs +++ b/engine/packages/epoxy/src/replica/messages/accept.rs @@ -2,7 +2,7 @@ use anyhow::{Result, ensure}; use epoxy_protocol::protocol; use universaldb::Transaction; -use crate::replica::{ballot, messages}; +use crate::replica::ballot; #[tracing::instrument(skip_all)] pub async fn accept( diff --git a/engine/packages/epoxy/src/replica/messages/accepted.rs b/engine/packages/epoxy/src/replica/messages/accepted.rs index bcbb0a0147..b91d5e24e4 100644 --- a/engine/packages/epoxy/src/replica/messages/accepted.rs +++ b/engine/packages/epoxy/src/replica/messages/accepted.rs @@ -2,7 +2,7 @@ use anyhow::Result; use epoxy_protocol::protocol; use universaldb::Transaction; -use crate::replica::{ballot, messages, utils}; +use crate::replica::ballot; // EPaxos Step 16 #[tracing::instrument(skip_all)] diff --git a/engine/packages/epoxy/src/replica/messages/pre_accept.rs b/engine/packages/epoxy/src/replica/messages/pre_accept.rs index 452f9d51ba..a365a2e1be 100644 --- a/engine/packages/epoxy/src/replica/messages/pre_accept.rs +++ b/engine/packages/epoxy/src/replica/messages/pre_accept.rs @@ -3,7 +3,7 @@ use epoxy_protocol::protocol; use std::cmp; use universaldb::Transaction; -use crate::replica::{ballot, messages, utils}; +use crate::replica::{ballot, utils}; #[tracing::instrument(skip_all)] pub async fn pre_accept( diff --git a/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs b/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs index 4d510805b8..fbc5ccca51 100644 --- a/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs +++ b/engine/packages/epoxy/src/workflows/coordinator/reconfigure.rs @@ -1,5 +1,5 @@ use anyhow::*; -use epoxy_protocol::protocol::{self, ReplicaId}; +use epoxy_protocol::protocol::{self}; use gas::prelude::*; use rivet_api_builder::ApiCtx; use serde::{Deserialize, Serialize}; diff --git a/engine/packages/epoxy/src/workflows/mod.rs b/engine/packages/epoxy/src/workflows/mod.rs index ecc638cdb8..c4270f75e9 100644 --- a/engine/packages/epoxy/src/workflows/mod.rs +++ b/engine/packages/epoxy/src/workflows/mod.rs @@ -1,2 +1,3 @@ pub mod coordinator; +pub mod purger; pub mod replica; diff --git a/engine/packages/epoxy/src/workflows/purger.rs b/engine/packages/epoxy/src/workflows/purger.rs new file mode 100644 index 0000000000..f68339b349 --- /dev/null +++ b/engine/packages/epoxy/src/workflows/purger.rs @@ -0,0 +1,81 @@ +use anyhow::*; +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64; +use epoxy_protocol::protocol; +use futures_util::FutureExt; +use gas::prelude::*; +use rivet_api_builder::ApiCtx; +use serde::{Deserialize, Serialize}; + +use crate::http_client; + +#[derive(Debug, Deserialize, Serialize)] +pub struct Input { + pub replica_id: protocol::ReplicaId, +} + +// HACK: This workflow is a hack used to implement token revoking. It should be replaced with proper snapshot +// reads +#[workflow] +pub async fn epoxy_purger(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> { + ctx.repeat(|ctx| { + let replica_id = input.replica_id; + + async move { + let sig = ctx.listen::().await?; + + ctx.activity(PurgeInput { + replica_id, + keys: sig.keys, + }) + .await?; + + Ok(Loop::<()>::Continue) + } + .boxed() + }) + .await?; + + Ok(()) +} + +#[signal("epoxy_purger_purge")] +pub struct Purge { + /// Base64 encoded keys. + pub keys: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct PurgeInput { + replica_id: protocol::ReplicaId, + /// Base64 encoded keys. + keys: Vec, +} + +#[activity(PurgeActivity)] +#[max_retries = 18_446_744_073_709_551_615] // Retry forever +async fn send_purge(ctx: &ActivityCtx, input: &PurgeInput) -> Result<()> { + let config = ctx + .op(crate::ops::read_cluster_config::Input {}) + .await? + .config; + + http_client::send_message( + &ApiCtx::new_from_activity(&ctx)?, + &config, + protocol::Request { + from_replica_id: ctx.config().epoxy_replica_id(), + to_replica_id: input.replica_id, + kind: protocol::RequestKind::KvPurgeRequest(protocol::KvPurgeRequest { + keys: input + .keys + .iter() + .map(|key| BASE64.decode(key).context("invalid base64 key")) + .collect::>>()?, + }), + }, + ) + .await?; + + Ok(()) +} diff --git a/engine/packages/epoxy/src/workflows/replica/mod.rs b/engine/packages/epoxy/src/workflows/replica/mod.rs index 36a29cfbde..a284eab60b 100644 --- a/engine/packages/epoxy/src/workflows/replica/mod.rs +++ b/engine/packages/epoxy/src/workflows/replica/mod.rs @@ -1,5 +1,4 @@ use anyhow::*; -use epoxy_protocol::protocol; use futures_util::FutureExt; use gas::prelude::*; use serde::{Deserialize, Serialize}; diff --git a/engine/packages/epoxy/src/workflows/replica/setup.rs b/engine/packages/epoxy/src/workflows/replica/setup.rs index 911be58f67..6022f5547c 100644 --- a/engine/packages/epoxy/src/workflows/replica/setup.rs +++ b/engine/packages/epoxy/src/workflows/replica/setup.rs @@ -675,7 +675,6 @@ async fn recover_key_value_with_instances( committed_entries.push(CommittedEntry { instance: (*instance_replica_id, *instance_slot_id), entry: entry.clone(), - seq: entry.seq, deps: entry.deps.clone(), }); } @@ -729,7 +728,7 @@ async fn recover_key_value_with_instances( struct CommittedEntry { instance: (protocol::ReplicaId, protocol::SlotId), entry: protocol::LogEntry, - seq: u64, // Seq is u64 in protocol + // seq: u64, // Seq is u64 in protocol deps: Vec, } diff --git a/engine/packages/gasoline/src/builder/common/signal.rs b/engine/packages/gasoline/src/builder/common/signal.rs index 7d601ab861..86f03fa878 100644 --- a/engine/packages/gasoline/src/builder/common/signal.rs +++ b/engine/packages/gasoline/src/builder/common/signal.rs @@ -43,6 +43,7 @@ impl SignalBuilder { // TODO: Get rid of this // NOTE: This is a bad implementation because it disregards other errors that may have happened earlier + #[allow(non_snake_case)] pub fn bypass_signal_from_workflow_I_KNOW_WHAT_IM_DOING(mut self) -> Self { if let Some(BuilderError::CannotDispatchFromOpInWorkflow) = &self.error { self.error = None; diff --git a/engine/packages/gasoline/src/ctx/standalone.rs b/engine/packages/gasoline/src/ctx/standalone.rs index 25e08c796e..222e764618 100644 --- a/engine/packages/gasoline/src/ctx/standalone.rs +++ b/engine/packages/gasoline/src/ctx/standalone.rs @@ -46,9 +46,9 @@ impl StandaloneCtx { ) -> WorkflowResult { let ts = rivet_util::timestamp::now(); - let span = tracing::Span::current(); - span.record("req_id", req_id.to_string()); - span.record("ray_id", ray_id.to_string()); + tracing::Span::current() + .record("req_id", req_id.to_string()) + .record("ray_id", ray_id.to_string()); let msg_ctx = MessageCtx::new(&config, &pools, &cache, ray_id)?; diff --git a/engine/packages/gasoline/src/db/kv/mod.rs b/engine/packages/gasoline/src/db/kv/mod.rs index 9bc6c0c095..debb197da8 100644 --- a/engine/packages/gasoline/src/db/kv/mod.rs +++ b/engine/packages/gasoline/src/db/kv/mod.rs @@ -2532,7 +2532,7 @@ impl Database for DatabaseKv { Ok(()) }) - .custom_instrument(tracing::info_span!("commit_workflow_sleep_event_tx")) + .custom_instrument(tracing::info_span!("upsert_loop_event_tx")) .await .map_err(WorkflowError::Udb)?; @@ -2593,7 +2593,7 @@ impl Database for DatabaseKv { Ok(()) }) - .custom_instrument(tracing::info_span!("update_workflow_sleep_event_tx")) + .custom_instrument(tracing::info_span!("update_workflow_sleep_state_tx")) .await .map_err(WorkflowError::Udb)?; diff --git a/engine/packages/guard-core/src/custom_serve.rs b/engine/packages/guard-core/src/custom_serve.rs index 3d54fdaeaa..351747e96d 100644 --- a/engine/packages/guard-core/src/custom_serve.rs +++ b/engine/packages/guard-core/src/custom_serve.rs @@ -3,6 +3,8 @@ use async_trait::async_trait; use bytes::Bytes; use http_body_util::Full; use hyper::{Request, Response}; +use tokio_tungstenite::tungstenite::protocol::frame::CloseFrame; +use uuid::Uuid; use crate::WebSocketHandle; use crate::proxy_service::ResponseBody; @@ -25,5 +27,7 @@ pub trait CustomServeTrait: Send + Sync { headers: &hyper::HeaderMap, path: &str, request_context: &mut RequestContext, - ) -> Result<()>; + // Identifies the websocket across retries. + unique_request_id: Uuid, + ) -> Result>; } diff --git a/engine/packages/guard-core/src/errors.rs b/engine/packages/guard-core/src/errors.rs index f45b47c0ce..c40d17a8dc 100644 --- a/engine/packages/guard-core/src/errors.rs +++ b/engine/packages/guard-core/src/errors.rs @@ -81,6 +81,14 @@ pub struct ServiceUnavailable; )] pub struct WebSocketServiceUnavailable; +#[derive(RivetError, Serialize, Deserialize)] +#[error("guard", "websocket_service_retry", "WebSocket service retry.")] +pub struct WebSocketServiceRetry; + +#[derive(RivetError, Serialize, Deserialize)] +#[error("guard", "websocket_service_timeout", "WebSocket service timed out.")] +pub struct WebSocketServiceTimeout; + #[derive(RivetError, Serialize, Deserialize)] #[error( "guard", diff --git a/engine/packages/guard-core/src/proxy_service.rs b/engine/packages/guard-core/src/proxy_service.rs index f4498099f3..cacc8be35a 100644 --- a/engine/packages/guard-core/src/proxy_service.rs +++ b/engine/packages/guard-core/src/proxy_service.rs @@ -28,14 +28,19 @@ use tokio_tungstenite::tungstenite::{ }; use tracing::Instrument; use url::Url; +use uuid::Uuid; use crate::{ WebSocketHandle, custom_serve::CustomServeTrait, errors, metrics, request_context::RequestContext, }; +const X_RIVET_TARGET: HeaderName = HeaderName::from_static("x-rivet-target"); +const X_RIVET_ACTOR: HeaderName = HeaderName::from_static("x-rivet-actor"); +const X_RIVET_TOKEN: HeaderName = HeaderName::from_static("x-rivet-token"); pub const X_FORWARDED_FOR: HeaderName = HeaderName::from_static("x-forwarded-for"); pub const X_RIVET_ERROR: HeaderName = HeaderName::from_static("x-rivet-error"); + const ROUTE_CACHE_TTL: Duration = Duration::from_secs(60 * 10); // 10 minutes const PROXY_STATE_CACHE_TTL: Duration = Duration::from_secs(60 * 60); // 1 hour const WEBSOCKET_CLOSE_LINGER: Duration = Duration::from_millis(100); // Keep TCP connection open briefly after WebSocket close @@ -879,7 +884,7 @@ impl ProxyService { match res { Ok(resp) => { // Check if this is a retryable response - if should_retry(resp.status(), resp.headers()) { + if should_retry_request_inner(resp.status(), resp.headers()) { // Request connect error, might retry tracing::debug!( "Request attempt {attempts} failed (service unavailable)" @@ -1017,10 +1022,10 @@ impl ProxyService { while attempts < max_attempts { attempts += 1; - let resp = handler + let res = handler .handle_request(req_collected.clone(), request_context) - .await?; - if should_retry(resp.status(), resp.headers()) { + .await; + if should_retry_request(&res) { // Request connect error, might retry tracing::debug!("Request attempt {attempts} failed (service unavailable)"); @@ -1047,7 +1052,7 @@ impl ProxyService { continue; } - return Ok(resp); + return res; } // If we get here, all attempts failed @@ -1059,6 +1064,7 @@ impl ProxyService { } } + /// Modifies the incoming request before it is proxied. fn proxied_request_builder( &self, req_parts: &hyper::http::request::Parts, @@ -1088,13 +1094,16 @@ impl ProxyService { .method(req_parts.method.clone()) .uri(url.to_string()); - // Add proxy headers - { - let headers = builder - .headers_mut() - .expect("request builder unexpectedly in error state"); - add_proxy_headers_with_addr(headers, &req_parts.headers, self.remote_addr)?; - } + // Modify proxy headers + let headers = builder + .headers_mut() + .expect("request builder unexpectedly in error state"); + + headers.remove(X_RIVET_TARGET); + headers.remove(X_RIVET_ACTOR); + headers.remove(X_RIVET_TOKEN); + + add_proxy_headers_with_addr(headers, &req_parts.headers, self.remote_addr)?; Ok(builder) } @@ -1171,7 +1180,7 @@ impl ProxyService { } // Handle WebSocket upgrade properly with hyper_tungstenite - tracing::debug!("Upgrading client connection to WebSocket"); + tracing::debug!(%req_path, "Upgrading client connection to WebSocket"); let (client_response, client_ws) = match hyper_tungstenite::upgrade(req, None) { Ok(x) => { tracing::debug!("Client WebSocket upgrade successful"); @@ -1782,18 +1791,20 @@ impl ProxyService { } ResolveRouteOutput::Response(_) => unreachable!(), ResolveRouteOutput::CustomServe(mut handlers) => { - tracing::debug!("Spawning task to handle WebSocket communication"); + tracing::debug!(%req_path, "Spawning task to handle WebSocket communication"); let mut request_context = request_context.clone(); let req_headers = req_headers.clone(); let req_path = req_path.clone(); let req_host = req_host.clone(); - // TODO: Handle errors here, the error message is lost tokio::spawn( async move { + let request_id = Uuid::new_v4(); let mut attempts = 0u32; - let ws_handle = WebSocketHandle::new(client_ws); + let ws_handle = WebSocketHandle::new(client_ws) + .await + .context("failed initiating websocket handle")?; loop { match handlers @@ -1802,19 +1813,15 @@ impl ProxyService { &req_headers, &req_path, &mut request_context, + request_id, ) .await { - Ok(()) => { + Ok(close_frame) => { tracing::debug!("websocket handler complete, closing"); // Send graceful close - ws_handle - .send(to_hyper_close(Some(CloseFrame { - code: CloseCode::Normal, - reason: "".into(), - }))) - .await?; + ws_handle.send(to_hyper_close(close_frame)).await?; // Flush to ensure close frame is sent ws_handle.flush().await?; @@ -1825,13 +1832,32 @@ impl ProxyService { break; } Err(err) => { - attempts += 1; - if attempts > max_attempts || !is_retryable_ws_error(&err) { + tracing::debug!(?err, "websocket handler error"); + + // Denotes that the connection did not fail, but needs to be retried to + // resole a new target + let ws_retry = is_ws_retry(&err); + + if ws_retry { + attempts = 0; + } else { + attempts += 1; + } + + if attempts > max_attempts + || (!is_retryable_ws_error(&err) && !ws_retry) + { + tracing::debug!( + ?attempts, + ?max_attempts, + "WebSocket failed" + ); + // Close WebSocket with error ws_handle - .accept_and_send(to_hyper_close(Some( - err_to_close_frame(err, ray_id), - ))) + .send(to_hyper_close(Some(err_to_close_frame( + err, ray_id, + )))) .await?; // Flush to ensure close frame is sent @@ -1842,11 +1868,19 @@ impl ProxyService { break; } else { - let backoff = ProxyService::calculate_backoff( - attempts, - initial_interval, - ); - tokio::time::sleep(backoff).await; + if !ws_retry { + let backoff = ProxyService::calculate_backoff( + attempts, + initial_interval, + ); + + tracing::debug!( + ?backoff, + "WebSocket attempt {attempts} failed (service unavailable)" + ); + + tokio::time::sleep(backoff).await; + } match state .resolve_route( @@ -1864,11 +1898,9 @@ impl ProxyService { } Ok(ResolveRouteOutput::Response(response)) => { ws_handle - .accept_and_send(to_hyper_close(Some( - str_to_close_frame( - response.message.as_ref(), - ), - ))) + .send(to_hyper_close(Some(str_to_close_frame( + response.message.as_ref(), + )))) .await?; // Flush to ensure close frame is sent @@ -1879,12 +1911,10 @@ impl ProxyService { } Ok(ResolveRouteOutput::Target(_)) => { ws_handle - .accept_and_send(to_hyper_close(Some( - err_to_close_frame( - errors::WebSocketTargetChanged.build(), - ray_id, - ), - ))) + .send(to_hyper_close(Some(err_to_close_frame( + errors::WebSocketTargetChanged.build(), + ray_id, + )))) .await?; // Flush to ensure close frame is sent @@ -1897,9 +1927,9 @@ impl ProxyService { } Err(err) => { ws_handle - .accept_and_send(to_hyper_close(Some( - err_to_close_frame(err, ray_id), - ))) + .send(to_hyper_close(Some(err_to_close_frame( + err, ray_id, + )))) .await?; // Flush to ensure close frame is sent @@ -1947,13 +1977,17 @@ impl ProxyService { impl ProxyService { // Process an individual request - #[tracing::instrument(name = "guard_request", skip_all)] + #[tracing::instrument(name = "guard_request", skip_all, fields(ray_id, req_id))] pub async fn process(&self, mut req: Request) -> Result> { let start_time = Instant::now(); let request_ids = RequestIds::new(self.state.config.dc_label()); req.extensions_mut().insert(request_ids); + tracing::Span::current() + .record("req_id", request_ids.req_id.to_string()) + .record("ray_id", request_ids.ray_id.to_string()); + // Create request context for analytics tracking let mut request_context = RequestContext::new(self.state.clickhouse_inserter.clone(), request_ids); @@ -2063,35 +2097,50 @@ impl ProxyService { // If we receive an error during a websocket request, we attempt to open the websocket anyway // so we can send the error via websocket instead of http. Most websocket clients don't handle - // HTTP errors in a meaningful way for the user resulting in unhelpful errors + // HTTP errors in a meaningful way resulting in unhelpful errors for the user if is_websocket { tracing::debug!("Upgrading client connection to WebSocket for error proxy"); match hyper_tungstenite::upgrade(mock_req, None) { Ok((client_response, client_ws)) => { tracing::debug!("Client WebSocket upgrade for error proxy successful"); - tokio::spawn(async move { - let ws_handle = WebSocketHandle::new(client_ws); - let frame = err_to_close_frame(err, Some(request_ids.ray_id)); + tokio::spawn( + async move { + let ws_handle = match WebSocketHandle::new(client_ws).await { + Ok(ws_handle) => ws_handle, + Err(err) => { + tracing::debug!( + ?err, + "failed initiating websocket handle for error proxy" + ); + return; + } + }; + let frame = err_to_close_frame(err, Some(request_ids.ray_id)); - // Manual conversion to handle different tungstenite versions - let code_num: u16 = frame.code.into(); - let reason = frame.reason.clone(); + // Manual conversion to handle different tungstenite versions + let code_num: u16 = frame.code.into(); + let reason = frame.reason.clone(); - if let Err(err) = ws_handle - .accept_and_send( - tokio_tungstenite::tungstenite::Message::Close(Some( + if let Err(err) = ws_handle + .send(tokio_tungstenite::tungstenite::Message::Close(Some( tokio_tungstenite::tungstenite::protocol::CloseFrame { code: code_num.into(), reason, }, - )), - ) - .await - { - tracing::debug!(?err, "failed sending error proxy"); + ))) + .await + { + tracing::debug!( + ?err, + "failed sending websocket error proxy" + ); + } } - }); + .instrument( + tracing::info_span!("ws_error_proxy_task", ?request_ids.ray_id), + ), + ); // Return the response that will upgrade the client connection // For proper WebSocket handshaking, we need to preserve the original response @@ -2371,8 +2420,21 @@ fn err_into_response(err: anyhow::Error) -> Result> { .map_err(Into::into) } +fn should_retry_request(res: &Result>) -> bool { + match res { + Ok(resp) => should_retry_request_inner(resp.status(), resp.headers()), + Err(err) => { + if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::()) { + rivet_err.group() == "guard" && rivet_err.code() == "service_unavailable" + } else { + false + } + } + } +} + // Determine if a response should trigger a retry: 503 + x-rivet-error -fn should_retry(status: StatusCode, headers: &hyper::HeaderMap) -> bool { +fn should_retry_request_inner(status: StatusCode, headers: &hyper::HeaderMap) -> bool { status == StatusCode::SERVICE_UNAVAILABLE && headers.contains_key(X_RIVET_ERROR) } @@ -2385,6 +2447,14 @@ fn is_retryable_ws_error(err: &anyhow::Error) -> bool { } } +fn is_ws_retry(err: &anyhow::Error) -> bool { + if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::()) { + rivet_err.group() == "guard" && rivet_err.code() == "websocket_service_retry" + } else { + false + } +} + fn str_to_close_frame(err: &str) -> CloseFrame { // NOTE: reason cannot be more than 123 bytes as per the WS protocol spec let reason = rivet_util::safe_slice(err, 0, 123).into(); diff --git a/engine/packages/guard-core/src/websocket_handle.rs b/engine/packages/guard-core/src/websocket_handle.rs index bb17d2df3b..763f337b20 100644 --- a/engine/packages/guard-core/src/websocket_handle.rs +++ b/engine/packages/guard-core/src/websocket_handle.rs @@ -4,7 +4,6 @@ use hyper::upgrade::Upgraded; use hyper_tungstenite::HyperWebsocket; use hyper_tungstenite::tungstenite::Message as WsMessage; use hyper_util::rt::TokioIo; -use std::ops::Deref; use std::sync::Arc; use tokio::sync::Mutex; use tokio_tungstenite::WebSocketStream; @@ -14,104 +13,34 @@ pub type WebSocketReceiver = futures_util::stream::SplitStream>, WsMessage>; -enum WebSocketState { - Unaccepted { websocket: HyperWebsocket }, - Accepting, - Split { ws_tx: WebSocketSender }, -} - #[derive(Clone)] -pub struct WebSocketHandle(Arc); - -impl WebSocketHandle { - pub fn new(websocket: HyperWebsocket) -> Self { - Self(Arc::new(WebSocketHandleInner { - state: Mutex::new(WebSocketState::Unaccepted { websocket }), - })) - } +pub struct WebSocketHandle { + ws_tx: Arc>, + ws_rx: Arc>, } -impl Deref for WebSocketHandle { - type Target = WebSocketHandleInner; - - fn deref(&self) -> &Self::Target { - &*self.0 - } -} - -pub struct WebSocketHandleInner { - state: Mutex, -} +impl WebSocketHandle { + pub async fn new(websocket: HyperWebsocket) -> Result { + let ws_stream = websocket.await?; + let (ws_tx, ws_rx) = ws_stream.split(); -impl WebSocketHandleInner { - pub async fn accept(&self) -> Result { - let mut state = self.state.lock().await; - Self::accept_inner(&mut *state).await + Ok(Self { + ws_tx: Arc::new(Mutex::new(ws_tx)), + ws_rx: Arc::new(Mutex::new(ws_rx)), + }) } pub async fn send(&self, message: WsMessage) -> Result<()> { - let mut state = self.state.lock().await; - match &mut *state { - WebSocketState::Unaccepted { .. } | WebSocketState::Accepting => { - bail!("websocket has not been accepted"); - } - WebSocketState::Split { ws_tx } => { - ws_tx.send(message).await?; - Ok(()) - } - } - } - - pub async fn accept_and_send(&self, message: WsMessage) -> Result<()> { - let mut state = self.state.lock().await; - match &mut *state { - WebSocketState::Unaccepted { .. } => { - let _ = Self::accept_inner(&mut *state).await?; - let WebSocketState::Split { ws_tx } = &mut *state else { - bail!("websocket should be accepted"); - }; - ws_tx.send(message).await?; - Ok(()) - } - WebSocketState::Accepting => { - bail!("in accepting state") - } - WebSocketState::Split { ws_tx } => { - ws_tx.send(message).await?; - Ok(()) - } - } + self.ws_tx.lock().await.send(message).await?; + Ok(()) } pub async fn flush(&self) -> Result<()> { - let mut state = self.state.lock().await; - match &mut *state { - WebSocketState::Unaccepted { .. } | WebSocketState::Accepting => { - bail!("websocket has not been accepted"); - } - WebSocketState::Split { ws_tx } => { - ws_tx.flush().await?; - Ok(()) - } - } + self.ws_tx.lock().await.flush().await?; + Ok(()) } - async fn accept_inner(state: &mut WebSocketState) -> Result { - if !matches!(*state, WebSocketState::Unaccepted { .. }) { - bail!("websocket already accepted") - } - - // Accept websocket - let old_state = std::mem::replace(&mut *state, WebSocketState::Accepting); - let WebSocketState::Unaccepted { websocket } = old_state else { - bail!("should be in unaccepted state"); - }; - - // Accept WS - let ws_stream = websocket.await?; - let (ws_tx, ws_rx) = ws_stream.split(); - *state = WebSocketState::Split { ws_tx }; - - Ok(ws_rx) + pub fn recv(&self) -> Arc> { + self.ws_rx.clone() } } diff --git a/engine/packages/guard/Cargo.toml b/engine/packages/guard/Cargo.toml index e5832eb483..7da5a574f6 100644 --- a/engine/packages/guard/Cargo.toml +++ b/engine/packages/guard/Cargo.toml @@ -43,6 +43,7 @@ rustls.workspace = true serde_json.workspace = true serde.workspace = true tokio.workspace = true +tokio-tungstenite.workspace = true tracing.workspace = true universaldb.workspace = true universalpubsub.workspace = true diff --git a/engine/packages/guard/src/routing/api_public.rs b/engine/packages/guard/src/routing/api_public.rs index 43415122da..143db070da 100644 --- a/engine/packages/guard/src/routing/api_public.rs +++ b/engine/packages/guard/src/routing/api_public.rs @@ -9,6 +9,7 @@ use hyper::{Request, Response}; use rivet_guard_core::WebSocketHandle; use rivet_guard_core::proxy_service::{ResponseBody, RoutingOutput}; use rivet_guard_core::{CustomServeTrait, request_context::RequestContext}; +use tokio_tungstenite::tungstenite::protocol::frame::CloseFrame; use tower::Service; struct ApiPublicService { @@ -50,7 +51,8 @@ impl CustomServeTrait for ApiPublicService { _headers: &hyper::HeaderMap, _path: &str, _request_context: &mut RequestContext, - ) -> Result<()> { + _unique_request_id: Uuid, + ) -> Result> { bail!("api-public does not support WebSocket connections") } } diff --git a/engine/packages/guard/src/routing/pegboard_gateway.rs b/engine/packages/guard/src/routing/pegboard_gateway.rs index a7d6b7573d..c65bbc5c8a 100644 --- a/engine/packages/guard/src/routing/pegboard_gateway.rs +++ b/engine/packages/guard/src/routing/pegboard_gateway.rs @@ -10,9 +10,7 @@ use crate::{errors, shared_state::SharedState}; const ACTOR_READY_TIMEOUT: Duration = Duration::from_secs(10); pub const X_RIVET_ACTOR: HeaderName = HeaderName::from_static("x-rivet-actor"); -pub const X_RIVET_AMESPACE: HeaderName = HeaderName::from_static("x-rivet-namespace"); const WS_PROTOCOL_ACTOR: &str = "rivet_actor."; -const WS_PROTOCOL_TOKEN: &str = "rivet_token."; /// Route requests to actor services using path-based routing #[tracing::instrument(skip_all)] @@ -172,9 +170,8 @@ async fn route_request_inner( res = stopped_sub.next() => { res?; - // Attempt to rewake once - if wake_retries < 3 { - tracing::debug!(?actor_id, ?wake_retries, "actor stopped while we were waiting for it to beocme ready, attempting rewake"); + if wake_retries < 16 { + tracing::debug!(?actor_id, ?wake_retries, "actor stopped while we were waiting for it to become ready, attempting rewake"); wake_retries += 1; let res = ctx.signal(pegboard::workflows::actor::Wake {}) @@ -194,6 +191,9 @@ async fn route_request_inner( } else { res?; } + } else { + tracing::warn!("actor retried waking 16 times, has not yet started"); + return Err(rivet_guard_core::errors::ServiceUnavailable.build()); } } res = fail_sub.next() => { diff --git a/engine/packages/namespace/src/keys/runner_config.rs b/engine/packages/namespace/src/keys/runner_config.rs index 1d232805e6..7974a121ff 100644 --- a/engine/packages/namespace/src/keys/runner_config.rs +++ b/engine/packages/namespace/src/keys/runner_config.rs @@ -38,7 +38,7 @@ impl FormalKey for DataKey { } fn serialize(&self, value: Self::Value) -> Result> { - rivet_data::versioned::NamespaceRunnerConfig::latest(value.into()) + rivet_data::versioned::NamespaceRunnerConfig::wrap_latest(value.into()) .serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_RUNNER_CONFIG_VERSION) } } @@ -129,7 +129,7 @@ impl FormalKey for ByVariantKey { } fn serialize(&self, value: Self::Value) -> Result> { - rivet_data::versioned::NamespaceRunnerConfig::latest(value.into()) + rivet_data::versioned::NamespaceRunnerConfig::wrap_latest(value.into()) .serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_RUNNER_CONFIG_VERSION) } } diff --git a/engine/packages/pegboard-gateway/Cargo.toml b/engine/packages/pegboard-gateway/Cargo.toml index ec5d7df480..693bf8de57 100644 --- a/engine/packages/pegboard-gateway/Cargo.toml +++ b/engine/packages/pegboard-gateway/Cargo.toml @@ -12,6 +12,7 @@ bytes.workspace = true futures-util.workspace = true gas.workspace = true http-body-util.workspace = true +# TODO: Doesn't match workspace version hyper = "1.6" hyper-tungstenite.workspace = true pegboard.workspace = true @@ -20,7 +21,9 @@ rivet-error.workspace = true rivet-guard-core.workspace = true rivet-runner-protocol.workspace = true rivet-util.workspace = true +scc.workspace = true serde.workspace = true +serde_json.workspace = true thiserror.workspace = true tokio-tungstenite.workspace = true tokio.workspace = true diff --git a/engine/packages/pegboard-gateway/src/lib.rs b/engine/packages/pegboard-gateway/src/lib.rs index 230afa357c..5bbbd978b4 100644 --- a/engine/packages/pegboard-gateway/src/lib.rs +++ b/engine/packages/pegboard-gateway/src/lib.rs @@ -4,26 +4,47 @@ use bytes::Bytes; use futures_util::TryStreamExt; use gas::prelude::*; use http_body_util::{BodyExt, Full}; -use hyper::{Request, Response, StatusCode, header::HeaderName}; +use hyper::{Request, Response, StatusCode}; +use rivet_error::*; use rivet_guard_core::{ WebSocketHandle, custom_serve::CustomServeTrait, - errors::{ServiceUnavailable, WebSocketServiceUnavailable}, + errors::{ + ServiceUnavailable, WebSocketServiceRetry, WebSocketServiceTimeout, + WebSocketServiceUnavailable, + }, proxy_service::ResponseBody, request_context::RequestContext, }; use rivet_runner_protocol as protocol; use rivet_util::serde::HashableMap; use std::time::Duration; -use tokio_tungstenite::tungstenite::{Message, protocol::frame::coding::CloseCode}; +use tokio::sync::watch; +use tokio_tungstenite::tungstenite::{ + Message, + protocol::frame::{CloseFrame, coding::CloseCode}, +}; use crate::shared_state::{SharedState, TunnelMessageData}; pub mod shared_state; const TUNNEL_ACK_TIMEOUT: Duration = Duration::from_secs(2); -const SEC_WEBSOCKET_PROTOCOL: HeaderName = HeaderName::from_static("sec-websocket-protocol"); -const WS_PROTOCOL_ACTOR: &str = "rivet_actor."; + +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "websocket_pending_limit_reached", + "Reached limit on pending websocket messages, aborting connection." +)] +pub struct WebsocketPendingLimitReached; + +#[derive(Debug)] +enum LifecycleResult { + ServerClose(protocol::ToServerWebSocketClose), + ClientClose(Option), + Aborted, +} pub struct PegboardGateway { shared_state: SharedState, @@ -78,9 +99,10 @@ impl CustomServeTrait for PegboardGateway { pegboard::pubsub_subjects::RunnerReceiverSubject::new(self.runner_id).to_string(); // Start listening for request responses - let (request_id, mut msg_rx) = self + let request_id = Uuid::new_v4().into_bytes(); + let mut msg_rx = self .shared_state - .start_in_flight_request(tunnel_subject) + .start_in_flight_request(tunnel_subject, request_id) .await; // Start request @@ -111,6 +133,10 @@ impl CustomServeTrait for PegboardGateway { ) => { return anyhow::Ok(response_start); } + protocol::ToServerTunnelMessageKind::ToServerResponseAbort => { + tracing::warn!("request aborted"); + return Err(ServiceUnavailable.build()); + } _ => { tracing::warn!("received non-response message from pubsub"); } @@ -122,7 +148,7 @@ impl CustomServeTrait for PegboardGateway { } } - tracing::warn!("received no message response"); + tracing::warn!(request_id=?Uuid::from_bytes(request_id), "received no message response during request init"); Err(ServiceUnavailable.build()) }; let response_start = tokio::time::timeout(TUNNEL_ACK_TIMEOUT, fut) @@ -157,7 +183,8 @@ impl CustomServeTrait for PegboardGateway { headers: &hyper::HeaderMap, _path: &str, _request_context: &mut RequestContext, - ) -> Result<()> { + unique_request_id: Uuid, + ) -> Result> { // Use the actor ID from the gateway instance let actor_id = self.actor_id.to_string(); @@ -174,9 +201,10 @@ impl CustomServeTrait for PegboardGateway { pegboard::pubsub_subjects::RunnerReceiverSubject::new(self.runner_id).to_string(); // Start listening for WebSocket messages - let (request_id, mut msg_rx) = self + let request_id = unique_request_id.into_bytes(); + let mut msg_rx = self .shared_state - .start_in_flight_request(tunnel_subject.clone()) + .start_in_flight_request(tunnel_subject.clone(), request_id) .await; // Send WebSocket open message @@ -199,9 +227,9 @@ impl CustomServeTrait for PegboardGateway { while let Some(msg) = msg_rx.recv().await { match msg { TunnelMessageData::Message( - protocol::ToServerTunnelMessageKind::ToServerWebSocketOpen, + protocol::ToServerTunnelMessageKind::ToServerWebSocketOpen(msg), ) => { - return anyhow::Ok(()); + return anyhow::Ok(msg); } TunnelMessageData::Message( protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(close), @@ -221,10 +249,11 @@ impl CustomServeTrait for PegboardGateway { } } - tracing::warn!("received no message response"); + tracing::warn!(request_id=?Uuid::from_bytes(request_id), "received no message response during ws init"); Err(WebSocketServiceUnavailable.build()) }; - tokio::time::timeout(TUNNEL_ACK_TIMEOUT, fut) + + let open_msg = tokio::time::timeout(TUNNEL_ACK_TIMEOUT, fut) .await .map_err(|_| { tracing::warn!("timed out waiting for tunnel ack"); @@ -232,120 +261,198 @@ impl CustomServeTrait for PegboardGateway { WebSocketServiceUnavailable.build() })??; - // Accept the WebSocket - let mut ws_rx = client_ws.accept().await?; + self.shared_state + .toggle_hibernation(request_id, open_msg.can_hibernate) + .await?; + + // Send reclaimed messages + self.shared_state + .resend_pending_websocket_messages(request_id, open_msg.last_msg_index) + .await?; + + let ws_rx = client_ws.recv(); - // Spawn task to forward messages from server to client - let mut server_to_client = tokio::spawn( + let (tunnel_to_ws_abort_tx, mut tunnel_to_ws_abort_rx) = watch::channel(()); + let (ws_to_tunnel_abort_tx, mut ws_to_tunnel_abort_rx) = watch::channel(()); + + // Spawn task to forward messages from tunnel to ws + let shared_state = self.shared_state.clone(); + let tunnel_to_ws = tokio::spawn( async move { - while let Some(msg) = msg_rx.recv().await { - match msg { - TunnelMessageData::Message( - protocol::ToServerTunnelMessageKind::ToServerWebSocketMessage(ws_msg), - ) => { - let msg = if ws_msg.binary { - Message::Binary(ws_msg.data.into()) + loop { + tokio::select! { + res = msg_rx.recv() => { + if let Some(msg) = res { + match msg { + TunnelMessageData::Message( + protocol::ToServerTunnelMessageKind::ToServerWebSocketMessage(ws_msg), + ) => { + let msg = if ws_msg.binary { + Message::Binary(ws_msg.data.into()) + } else { + Message::Text( + String::from_utf8_lossy(&ws_msg.data).into_owned().into(), + ) + }; + client_ws.send(msg).await?; + } + TunnelMessageData::Message( + protocol::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack), + ) => { + shared_state + .ack_pending_websocket_messages(request_id, ack.index) + .await?; + } + TunnelMessageData::Message( + protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(close), + ) => { + tracing::debug!(?close, "server closed websocket"); + + + if open_msg.can_hibernate && close.retry { + // Successful closure + return Err(WebSocketServiceRetry.build()); + } else { + return Ok(LifecycleResult::ServerClose(close)); + } + } + TunnelMessageData::Timeout => { + tracing::warn!("websocket message timeout"); + return Err(WebSocketServiceTimeout.build()); + } + _ => {} + } } else { - Message::Text( - String::from_utf8_lossy(&ws_msg.data).into_owned().into(), - ) - }; - client_ws.send(msg).await?; - } - TunnelMessageData::Message( - protocol::ToServerTunnelMessageKind::ToServerWebSocketClose(close), - ) => { - tracing::debug!(?close, "server closed websocket"); - return Err(WebSocketServiceUnavailable.build()); + tracing::debug!("tunnel sub closed"); + return Err(WebSocketServiceRetry.build()); + } } - TunnelMessageData::Timeout => { - tracing::warn!("websocket message timeout"); - return Err(WebSocketServiceUnavailable.build()); + _ = tunnel_to_ws_abort_rx.changed() => { + tracing::debug!("task aborted"); + return Ok(LifecycleResult::Aborted); } - _ => {} } } - - tracing::debug!("sub closed"); - - Err(WebSocketServiceUnavailable.build()) } - .instrument(tracing::info_span!("server_to_client_task")), + .instrument(tracing::info_span!("tunnel_to_ws_task")), ); - // Spawn task to forward messages from client to server + // Spawn task to forward messages from ws to tunnel let shared_state_clone = self.shared_state.clone(); - let mut client_to_server = tokio::spawn( + let ws_to_tunnel = tokio::spawn( async move { - while let Some(msg) = ws_rx.try_next().await? { - match msg { - Message::Binary(data) => { - let ws_message = - protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage( - protocol::ToClientWebSocketMessage { - data: data.into(), - binary: true, - }, - ); - shared_state_clone - .send_message(request_id, ws_message) - .await?; - } - Message::Text(text) => { - let ws_message = - protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage( - protocol::ToClientWebSocketMessage { - data: text.as_bytes().to_vec(), - binary: false, - }, - ); - shared_state_clone - .send_message(request_id, ws_message) - .await?; + let mut ws_rx = ws_rx.lock().await; + + loop { + tokio::select! { + res = ws_rx.try_next() => { + if let Some(msg) = res? { + match msg { + Message::Binary(data) => { + let ws_message = + protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage( + protocol::ToClientWebSocketMessage { + // NOTE: This gets set in shared_state.ts + index: 0, + data: data.into(), + binary: true, + }, + ); + shared_state_clone + .send_message(request_id, ws_message) + .await?; + } + Message::Text(text) => { + let ws_message = + protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage( + protocol::ToClientWebSocketMessage { + // NOTE: This gets set in shared_state.ts + index: 0, + data: text.as_bytes().to_vec(), + binary: false, + }, + ); + shared_state_clone + .send_message(request_id, ws_message) + .await?; + } + Message::Close(close) => { + return Ok(LifecycleResult::ClientClose(close)); + } + _ => {} + } + } else { + tracing::debug!("websocket stream closed"); + return Ok(LifecycleResult::ClientClose(None)); + } } - Message::Close(_) => { - return Ok(()); + _ = ws_to_tunnel_abort_rx.changed() => { + tracing::debug!("task aborted"); + return Ok(LifecycleResult::Aborted); } - _ => {} - } + }; } - - tracing::debug!("websocket stream closed"); - - Ok(()) } - .instrument(tracing::info_span!("client_to_server_task")), + .instrument(tracing::info_span!("ws_to_tunnel_task")), ); - // Wait for either task to complete - let lifecycle_res = tokio::select! { - res = &mut server_to_client => { - let res = res?; - tracing::debug!(?res, "server to client task completed"); + // Wait for both tasks to complete + let (tunnel_to_ws_res, ws_to_tunnel_res) = tokio::join!( + async { + let res = tunnel_to_ws.await?; + + // Abort other if not aborted + if !matches!(res, Ok(LifecycleResult::Aborted)) { + tracing::debug!(?res, "tunnel to ws task completed, aborting counterpart"); + + drop(ws_to_tunnel_abort_tx); + } else { + tracing::debug!(?res, "tunnel to ws task completed"); + } + res - } - res = &mut client_to_server => { - let res = res?; - tracing::debug!(?res, "client to server task completed"); + }, + async { + let res = ws_to_tunnel.await?; + + // Abort other if not aborted + if !matches!(res, Ok(LifecycleResult::Aborted)) { + tracing::debug!(?res, "ws to tunnel task completed, aborting counterpart"); + + drop(tunnel_to_ws_abort_tx); + } else { + tracing::debug!(?res, "ws to tunnel task completed"); + } + res } - }; - - // Abort remaining tasks - server_to_client.abort(); - client_to_server.abort(); + ); - let (close_code, close_reason) = if lifecycle_res.is_ok() { - (CloseCode::Normal.into(), None) - } else { - (CloseCode::Error.into(), Some("ws.downstream_closed".into())) + // Determine single result from both tasks + let mut lifecycle_res = match (tunnel_to_ws_res, ws_to_tunnel_res) { + // Prefer error + (_, Err(err)) => Err(err), + (Err(err), _) => Err(err), + // Prefer non aborted result if both succeed + (Ok(res), Ok(LifecycleResult::Aborted)) => Ok(res), + (Ok(LifecycleResult::Aborted), Ok(res)) => Ok(res), + // Prefer tunnel to ws if both succeed (unlikely case) + (res, _) => res, }; // Send WebSocket close message to runner + let (close_code, close_reason) = match &mut lifecycle_res { + // Taking here because it won't be used again + Ok(LifecycleResult::ClientClose(Some(close))) => { + (close.code, Some(std::mem::take(&mut close.reason))) + } + Ok(_) => (CloseCode::Normal.into(), None), + Err(_) => (CloseCode::Error.into(), Some("ws.downstream_closed".into())), + }; let close_message = protocol::ToClientTunnelMessageKind::ToClientWebSocketClose( protocol::ToClientWebSocketClose { - code: Some(close_code), - reason: close_reason, + code: Some(close_code.into()), + reason: close_reason.map(|x| x.as_str().to_string()), }, ); @@ -357,6 +464,20 @@ impl CustomServeTrait for PegboardGateway { tracing::error!(?err, "error sending close message"); } - lifecycle_res + // Send WebSocket close message to client + match lifecycle_res { + Ok(LifecycleResult::ServerClose(close)) => { + if let Some(code) = close.code { + Ok(Some(CloseFrame { + code: code.into(), + reason: close.reason.unwrap_or_default().into(), + })) + } else { + Ok(None) + } + } + Ok(_) => Ok(None), + Err(err) => Err(err), + } } } diff --git a/engine/packages/pegboard-gateway/src/shared_state.rs b/engine/packages/pegboard-gateway/src/shared_state.rs index 7d93e4e93d..08abdf8aca 100644 --- a/engine/packages/pegboard-gateway/src/shared_state.rs +++ b/engine/packages/pegboard-gateway/src/shared_state.rs @@ -1,18 +1,26 @@ use anyhow::Result; use gas::prelude::*; use rivet_runner_protocol::{self as protocol, MessageId, PROTOCOL_VERSION, RequestId, versioned}; +use scc::{HashMap, hash_map::Entry}; use std::{ - collections::HashMap, ops::Deref, sync::Arc, time::{Duration, Instant}, }; -use tokio::sync::{Mutex, mpsc}; +use tokio::sync::mpsc; use universalpubsub::{NextOutput, PubSub, PublishOpts, Subscriber}; use vbare::OwnedVersionedData; -const GC_INTERVAL: Duration = Duration::from_secs(60); -const MESSAGE_ACK_TIMEOUT: Duration = Duration::from_secs(5); +use crate::WebsocketPendingLimitReached; + +const GC_INTERVAL: Duration = Duration::from_secs(15); +const MESSAGE_ACK_TIMEOUT: Duration = Duration::from_secs(30); +const MAX_PENDING_MSGS_SIZE_PER_REQ: u64 = util::size::mebibytes(1); + +pub enum TunnelMessageData { + Message(protocol::ToServerTunnelMessageKind), + Timeout, +} struct InFlightRequest { /// UPS subject to send messages to for this request. @@ -21,23 +29,30 @@ struct InFlightRequest { msg_tx: mpsc::Sender, /// True once first message for this request has been sent (so runner learned reply_to). opened: bool, + pending_msgs: Vec, + hibernation_state: Option, } -struct PendingMessage { - request_id: RequestId, +pub struct PendingMessage { + message_id: MessageId, send_instant: Instant, } -pub enum TunnelMessageData { - Message(protocol::ToServerTunnelMessageKind), - Timeout, +struct HibernationState { + total_pending_ws_msgs_size: u64, + last_ws_msg_index: u16, + pending_ws_msgs: Vec, +} + +pub struct PendingWebsocketMessage { + payload: Vec, + send_instant: Instant, } pub struct SharedStateInner { ups: PubSub, receiver_subject: String, - requests_in_flight: Mutex>, - pending_messages: Mutex>, + in_flight_requests: HashMap, } #[derive(Clone)] @@ -52,8 +67,7 @@ impl SharedState { Self(Arc::new(SharedStateInner { ups, receiver_subject, - requests_in_flight: Mutex::new(HashMap::new()), - pending_messages: Mutex::new(HashMap::new()), + in_flight_requests: HashMap::new(), })) } @@ -69,44 +83,67 @@ impl SharedState { Ok(()) } + pub async fn start_in_flight_request( + &self, + receiver_subject: String, + request_id: RequestId, + ) -> mpsc::Receiver { + let (msg_tx, msg_rx) = mpsc::channel(128); + + match self.in_flight_requests.entry_async(request_id).await { + Entry::Vacant(entry) => { + entry.insert_entry(InFlightRequest { + receiver_subject, + msg_tx, + opened: false, + pending_msgs: Vec::new(), + hibernation_state: None, + }); + } + Entry::Occupied(mut entry) => { + entry.receiver_subject = receiver_subject; + entry.msg_tx = msg_tx; + entry.opened = false; + entry.pending_msgs.clear(); + } + } + + msg_rx + } + pub async fn send_message( &self, request_id: RequestId, - message_kind: protocol::ToClientTunnelMessageKind, + mut message_kind: protocol::ToClientTunnelMessageKind, ) -> Result<()> { let message_id = Uuid::new_v4().as_bytes().clone(); - // Get subject and whether this is the first message for this request - let (tunnel_receiver_subject, include_reply_to) = { - let mut requests_in_flight = self.requests_in_flight.lock().await; - if let Some(req) = requests_in_flight.get_mut(&request_id) { - let receiver_subject = req.receiver_subject.clone(); - let include_reply_to = !req.opened; - if include_reply_to { - // Mark as opened so subsequent messages skip reply_to - req.opened = true; - } - (receiver_subject, include_reply_to) - } else { - bail!("request not in flight") - } - }; + let mut req = self + .in_flight_requests + .get_async(&request_id) + .await + .context("request not in flight")?; - // Save pending message - { - let mut pending_messages = self.pending_messages.lock().await; - pending_messages.insert( - message_id, - PendingMessage { - request_id, - send_instant: Instant::now(), - }, - ); + let include_reply_to = !req.opened; + if include_reply_to { + // Mark as opened so subsequent messages skip reply_to + req.opened = true; } - // Send message - let message = protocol::ToClient::ToClientTunnelMessage(protocol::ToClientTunnelMessage { - request_id, + let ws_msg_index = + if let (Some(hs), protocol::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg)) = + (&req.hibernation_state, &mut message_kind) + { + // TODO: This ends up skipping 0 as an index when initiated but whatever + msg.index = hs.last_ws_msg_index.wrapping_add(1); + + Some(msg.index) + } else { + None + }; + + let payload = protocol::ToClientTunnelMessage { + request_id: request_id.clone(), message_id, // Only send reply to subject on the first message for this request. This reduces // overhead of subsequent messages. @@ -116,12 +153,41 @@ impl SharedState { None }, message_kind, + }; + + let now = Instant::now(); + req.pending_msgs.push(PendingMessage { + message_id, + send_instant: now, }); - let message_serialized = versioned::ToClient::latest(message) + + // Send message + let message = protocol::ToClient::ToClientTunnelMessage(payload); + let message_serialized = versioned::ToClient::wrap_latest(message) .serialize_with_embedded_version(PROTOCOL_VERSION)?; + + if let (Some(hs), Some(ws_msg_index)) = (&mut req.hibernation_state, ws_msg_index) { + hs.total_pending_ws_msgs_size += message_serialized.len() as u64; + + if hs.total_pending_ws_msgs_size > MAX_PENDING_MSGS_SIZE_PER_REQ + || hs.pending_ws_msgs.len() >= u16::MAX as usize + { + return Err(WebsocketPendingLimitReached {}.build()); + } + + hs.last_ws_msg_index = ws_msg_index; + + let pending_ws_msg = PendingWebsocketMessage { + payload: message_serialized.clone(), + send_instant: now, + }; + + hs.pending_ws_msgs.push(pending_ws_msg); + } + self.ups .publish( - &tunnel_receiver_subject, + &req.receiver_subject, &message_serialized, PublishOpts::one(), ) @@ -130,23 +196,6 @@ impl SharedState { Ok(()) } - pub async fn start_in_flight_request( - &self, - receiver_subject: String, - ) -> (RequestId, mpsc::Receiver) { - let id = Uuid::new_v4().into_bytes(); - let (msg_tx, msg_rx) = mpsc::channel(128); - self.requests_in_flight.lock().await.insert( - id, - InFlightRequest { - receiver_subject, - msg_tx, - opened: false, - }, - ); - (id, msg_rx) - } - async fn receiver(&self, mut sub: Subscriber) { while let Ok(NextOutput::Message(msg)) = sub.next().await { tracing::trace!( @@ -157,31 +206,37 @@ impl SharedState { match versioned::ToGateway::deserialize_with_embedded_version(&msg.payload) { Ok(protocol::ToGateway { message: msg }) => { tracing::debug!( - ?msg.request_id, - ?msg.message_id, + request_id=?Uuid::from_bytes(msg.request_id), + message_id=?Uuid::from_bytes(msg.message_id), "successfully deserialized message" ); + + let Some(mut in_flight) = + self.in_flight_requests.get_async(&msg.request_id).await + else { + tracing::debug!( + request_id=?Uuid::from_bytes(msg.request_id), + "in flight has already been disconnected" + ); + continue; + }; + if let protocol::ToServerTunnelMessageKind::TunnelAck = &msg.message_kind { - // Handle ack message + let prev_len = in_flight.pending_msgs.len(); + + in_flight + .pending_msgs + .retain(|m| m.message_id != msg.message_id); - let mut pending_messages = self.pending_messages.lock().await; - if pending_messages.remove(&msg.message_id).is_none() { + if prev_len == in_flight.pending_msgs.len() { tracing::warn!( "pending message does not exist or ack received after message body" - ); + ) } } else { // Send message to the request handler to emulate the real network action - let requests_in_flight = self.requests_in_flight.lock().await; - let Some(in_flight) = requests_in_flight.get(&msg.request_id) else { - tracing::debug!( - ?msg.request_id, - "in flight has already been disconnected" - ); - continue; - }; tracing::debug!( - ?msg.request_id, + request_id=?Uuid::from_bytes(msg.request_id), "forwarding message to request handler" ); let _ = in_flight @@ -200,15 +255,16 @@ impl SharedState { message_kind: protocol::ToClientTunnelMessageKind::TunnelAck, }, ); - let ack_message_serialized = match versioned::ToClient::latest(ack_message) - .serialize_with_embedded_version(PROTOCOL_VERSION) - { - Ok(x) => x, - Err(err) => { - tracing::error!(?err, "failed to serialize ack"); - continue; - } - }; + let ack_message_serialized = + match versioned::ToClient::wrap_latest(ack_message) + .serialize_with_embedded_version(PROTOCOL_VERSION) + { + Ok(x) => x, + Err(err) => { + tracing::error!(?err, "failed to serialize ack"); + continue; + } + }; tokio::spawn(async move { if let Err(err) = ups_clone .publish( @@ -230,46 +286,158 @@ impl SharedState { } } + pub async fn toggle_hibernation(&self, request_id: RequestId, enable: bool) -> Result<()> { + let mut req = self + .in_flight_requests + .get_async(&request_id) + .await + .context("request not in flight")?; + + match (req.hibernation_state.is_some(), enable) { + (true, true) => {} + (true, false) => req.hibernation_state = None, + (false, true) => { + req.hibernation_state = Some(HibernationState { + total_pending_ws_msgs_size: 0, + last_ws_msg_index: 0, + pending_ws_msgs: Vec::new(), + }); + } + (false, false) => {} + } + + Ok(()) + } + + pub async fn resend_pending_websocket_messages( + &self, + request_id: RequestId, + last_msg_index: i64, + ) -> Result<()> { + let Some(mut req) = self.in_flight_requests.get_async(&request_id).await else { + bail!("request not in flight"); + }; + + let receiver_subject = req.receiver_subject.clone(); + + if let Some(hs) = &mut req.hibernation_state { + if !hs.pending_ws_msgs.is_empty() { + tracing::debug!(request_id=?Uuid::from_bytes(request_id.clone()), len=?hs.pending_ws_msgs.len(), ?last_msg_index, "resending pending messages"); + + let len = hs.pending_ws_msgs.len().try_into()?; + + for (iter_index, pending_msg) in hs.pending_ws_msgs.iter().enumerate() { + let msg_index = hs + .last_ws_msg_index + .wrapping_sub(len) + .wrapping_add(1) + .wrapping_add(iter_index.try_into()?); + + if last_msg_index < 0 || wrapping_gt(msg_index, last_msg_index.try_into()?) { + self.ups + .publish(&receiver_subject, &pending_msg.payload, PublishOpts::one()) + .await?; + } + } + + // Perform ack + if last_msg_index >= 0 { + let last_msg_index = last_msg_index.try_into()?; + let mut iter_index = 0; + + hs.pending_ws_msgs.retain(|_| { + let msg_index = hs + .last_ws_msg_index + .wrapping_sub(len) + .wrapping_add(1) + .wrapping_add(iter_index); + let keep = wrapping_gt(msg_index, last_msg_index); + + iter_index += 1; + + keep + }); + + if hs.pending_ws_msgs.is_empty() { + hs.last_ws_msg_index = last_msg_index; + } + } + } + } + + Ok(()) + } + + pub async fn ack_pending_websocket_messages( + &self, + request_id: RequestId, + ack_index: u16, + ) -> Result<()> { + let Some(mut req) = self.in_flight_requests.get_async(&request_id).await else { + bail!("request not in flight"); + }; + + let Some(hs) = &mut req.hibernation_state else { + tracing::warn!("cannot ack ws messages, hibernation is not enabled"); + return Ok(()); + }; + + let len = hs.pending_ws_msgs.len().try_into()?; + let mut iter_index = 0u16; + hs.pending_ws_msgs.retain(|_| { + let msg_index = hs + .last_ws_msg_index + .wrapping_sub(len) + .wrapping_add(1) + .wrapping_add(iter_index); + let keep = wrapping_gt(msg_index, ack_index); + + iter_index += 1; + + keep + }); + + Ok(()) + } + async fn gc(&self) { let mut interval = tokio::time::interval(GC_INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { interval.tick().await; let now = Instant::now(); - // Purge unacked messages - { - let mut pending_messages = self.pending_messages.lock().await; - let mut removed_req_ids = Vec::new(); - pending_messages.retain(|_k, v| { - if now.duration_since(v.send_instant) > MESSAGE_ACK_TIMEOUT { - // Expired - removed_req_ids.push(v.request_id.clone()); - false - } else { - true + self.in_flight_requests + .retain_async(|_, req| { + if req.msg_tx.is_closed() { + return false; } - }); - // Close in-flight messages - let requests_in_flight = self.requests_in_flight.lock().await; - for req_id in removed_req_ids { - if let Some(x) = requests_in_flight.get(&req_id) { - let _ = x.msg_tx.send(TunnelMessageData::Timeout); - } else { - tracing::warn!( - ?req_id, - "message expired for in flight that does not exist" - ); + let mut keep = true; + + if let Some(earliest_pending_msg) = req.pending_msgs.first() { + keep = now.duration_since(earliest_pending_msg.send_instant) + > MESSAGE_ACK_TIMEOUT; } - } - } - // Purge no longer in flight - { - let mut requests_in_flight = self.requests_in_flight.lock().await; - requests_in_flight.retain(|_k, v| !v.msg_tx.is_closed()); - } + if let Some(hs) = &req.hibernation_state { + if let (true, Some(earliest_pending_ws_msg)) = + (keep, hs.pending_ws_msgs.first()) + { + keep = now.duration_since(earliest_pending_ws_msg.send_instant) + > MESSAGE_ACK_TIMEOUT; + } + } + + if !keep { + let _ = req.msg_tx.send(TunnelMessageData::Timeout); + } + + keep + }) + .await; } } } @@ -281,3 +449,11 @@ impl Deref for SharedState { &self.0 } } + +fn wrapping_gt(a: u16, b: u16) -> bool { + a != b && a.wrapping_sub(b) < u16::MAX / 2 +} + +// fn wrapping_lt(a: u16, b: u16) -> bool { +// b.wrapping_sub(a) < u16::MAX / 2 +// } diff --git a/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs b/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs index 99b9ec6e1c..8b72bb69ff 100644 --- a/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs +++ b/engine/packages/pegboard-runner/src/client_to_pubsub_task.rs @@ -8,14 +8,22 @@ use pegboard_actor_kv as kv; use rivet_guard_core::websocket_handle::WebSocketReceiver; use rivet_runner_protocol::{self as protocol, PROTOCOL_VERSION, versioned}; use std::sync::{Arc, atomic::Ordering}; +use tokio::sync::Mutex; use universalpubsub::PublishOpts; use vbare::OwnedVersionedData; use crate::conn::Conn; #[tracing::instrument(skip_all, fields(runner_id=?conn.runner_id, workflow_id=?conn.workflow_id, protocol_version=%conn.protocol_version))] -pub async fn task(ctx: StandaloneCtx, conn: Arc, mut ws_rx: WebSocketReceiver) -> Result<()> { +pub async fn task( + ctx: StandaloneCtx, + conn: Arc, + ws_rx: Arc>, +) -> Result<()> { tracing::debug!("starting WebSocket to pubsub forwarding task"); + + let mut ws_rx = ws_rx.lock().await; + while let Some(msg) = ws_rx.try_next().await? { match msg { WsMessage::Binary(data) => { @@ -27,7 +35,7 @@ pub async fn task(ctx: StandaloneCtx, conn: Arc, mut ws_rx: WebSocketRecei // Parse message let msg = match versioned::ToServer::deserialize_version(&data, conn.protocol_version) - .and_then(|x| x.into_latest()) + .and_then(|x| x.unwrap_latest()) { Ok(x) => x, Err(err) => { @@ -87,7 +95,7 @@ async fn handle_message( let actor_id = match Id::parse(&req.actor_id) { Ok(actor_id) => actor_id, Err(err) => { - let res_msg = versioned::ToClient::latest( + let res_msg = versioned::ToClient::wrap_latest( protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: protocol::KvResponseData::KvErrorResponse( @@ -124,16 +132,16 @@ async fn handle_message( // Verify actor belongs to this runner if !actor_belongs { - let res_msg = versioned::ToClient::latest(protocol::ToClient::ToClientKvResponse( - protocol::ToClientKvResponse { + let res_msg = versioned::ToClient::wrap_latest( + protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: protocol::KvResponseData::KvErrorResponse( protocol::KvErrorResponse { message: "given actor does not belong to runner".to_string(), }, ), - }, - )); + }), + ); let res_msg_serialized = res_msg .serialize(conn.protocol_version) @@ -152,7 +160,7 @@ async fn handle_message( protocol::KvRequestData::KvGetRequest(body) => { let res = kv::get(&*ctx.udb()?, actor_id, body.keys).await; - let res_msg = versioned::ToClient::latest( + let res_msg = versioned::ToClient::wrap_latest( protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: match res { @@ -196,7 +204,7 @@ async fn handle_message( ) .await; - let res_msg = versioned::ToClient::latest( + let res_msg = versioned::ToClient::wrap_latest( protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: match res { @@ -230,7 +238,7 @@ async fn handle_message( protocol::KvRequestData::KvPutRequest(body) => { let res = kv::put(&*ctx.udb()?, actor_id, body.keys, body.values).await; - let res_msg = versioned::ToClient::latest( + let res_msg = versioned::ToClient::wrap_latest( protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: match res { @@ -258,7 +266,7 @@ async fn handle_message( protocol::KvRequestData::KvDeleteRequest(body) => { let res = kv::delete(&*ctx.udb()?, actor_id, body.keys).await; - let res_msg = versioned::ToClient::latest( + let res_msg = versioned::ToClient::wrap_latest( protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: match res { @@ -284,7 +292,7 @@ async fn handle_message( protocol::KvRequestData::KvDropRequest => { let res = kv::delete_all(&*ctx.udb()?, actor_id).await; - let res_msg = versioned::ToClient::latest( + let res_msg = versioned::ToClient::wrap_latest( protocol::ToClient::ToClientKvResponse(protocol::ToClientKvResponse { request_id: req.request_id, data: match res { @@ -360,7 +368,7 @@ async fn handle_tunnel_message( } // Publish message to UPS - let msg_serialized = versioned::ToGateway::latest(protocol::ToGateway { message: msg }) + let msg_serialized = versioned::ToGateway::wrap_latest(protocol::ToGateway { message: msg }) .serialize_with_embedded_version(PROTOCOL_VERSION) .context("failed to serialize tunnel message for gateway")?; ctx.ups() diff --git a/engine/packages/pegboard-runner/src/conn.rs b/engine/packages/pegboard-runner/src/conn.rs index 7649717cd5..198bf0fb53 100644 --- a/engine/packages/pegboard-runner/src/conn.rs +++ b/engine/packages/pegboard-runner/src/conn.rs @@ -4,7 +4,7 @@ use gas::prelude::Id; use gas::prelude::*; use hyper_tungstenite::tungstenite::Message; use pegboard::ops::runner::update_alloc_idx::{Action, RunnerEligibility}; -use rivet_guard_core::{WebSocketHandle, websocket_handle::WebSocketReceiver}; +use rivet_guard_core::WebSocketHandle; use rivet_runner_protocol as protocol; use rivet_runner_protocol::*; use std::{ @@ -20,6 +20,7 @@ use crate::{errors::WsError, utils::UrlData}; pub struct TunnelActiveRequest { /// Subject to send replies to. pub gateway_reply_to: String, + pub is_ws: bool, } pub struct Conn { @@ -42,7 +43,6 @@ pub struct Conn { pub async fn init_conn( ctx: &StandaloneCtx, ws_handle: WebSocketHandle, - ws_rx: &mut WebSocketReceiver, UrlData { protocol_version, namespace, @@ -59,6 +59,9 @@ pub async fn init_conn( tracing::debug!("new runner connection"); + let ws_rx = ws_handle.recv(); + let mut ws_rx = ws_rx.lock().await; + // Receive init packet let (runner_id, workflow_id) = if let Some(msg) = tokio::time::timeout(Duration::from_secs(5), ws_rx.next()) diff --git a/engine/packages/pegboard-runner/src/lib.rs b/engine/packages/pegboard-runner/src/lib.rs index 95b4a1591d..0864e32a50 100644 --- a/engine/packages/pegboard-runner/src/lib.rs +++ b/engine/packages/pegboard-runner/src/lib.rs @@ -11,7 +11,7 @@ use rivet_guard_core::{ }; use rivet_runner_protocol as protocol; use std::time::Duration; -use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode; +use tokio_tungstenite::tungstenite::protocol::frame::{CloseFrame, coding::CloseCode}; use universalpubsub::PublishOpts; use vbare::OwnedVersionedData; @@ -61,7 +61,8 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe { _headers: &hyper::HeaderMap, path: &str, _request_context: &mut RequestContext, - ) -> Result<()> { + _unique_request_id: Uuid, + ) -> Result> { // Get UPS let ups = self.ctx.ups().context("failed to get UPS instance")?; @@ -73,14 +74,8 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe { tracing::debug!(?path, "tunnel ws connection established"); - // Accept WS - let mut ws_rx = ws_handle - .accept() - .await - .context("failed to accept WebSocket connection")?; - // Create connection - let conn = conn::init_conn(&self.ctx, ws_handle.clone(), &mut ws_rx, url_data) + let conn = conn::init_conn(&self.ctx, ws_handle.clone(), url_data) .await .context("failed to initialize runner connection")?; @@ -101,7 +96,7 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe { let mut client_to_pubsub = tokio::spawn(client_to_pubsub_task::task( self.ctx.clone(), conn.clone(), - ws_rx, + ws_handle.recv(), )); // Update pings @@ -152,24 +147,30 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe { // Send WebSocket close messages to all remaining active requests let active_requests = conn.tunnel_active_requests.lock().await; for (request_id, req) in &*active_requests { - let (close_code, close_reason) = if lifecycle_res.is_ok() { - (CloseCode::Normal.into(), None) + let close_msg_kind = if req.is_ws { + let (close_code, close_reason) = if lifecycle_res.is_ok() { + (CloseCode::Normal.into(), None) + } else { + (CloseCode::Error.into(), Some("ws.upstream_closed".into())) + }; + + protocol::ToServerTunnelMessageKind::ToServerWebSocketClose( + protocol::ToServerWebSocketClose { + code: Some(close_code), + reason: close_reason, + retry: true, + }, + ) } else { - (CloseCode::Error.into(), Some("ws.upstream_closed".into())) + protocol::ToServerTunnelMessageKind::ToServerResponseAbort }; - let close_message = protocol::ToServerTunnelMessage { request_id: request_id.clone(), message_id: Uuid::new_v4().into_bytes(), - message_kind: protocol::ToServerTunnelMessageKind::ToServerWebSocketClose( - protocol::ToServerWebSocketClose { - code: Some(close_code), - reason: close_reason, - }, - ), + message_kind: close_msg_kind, }; - let msg_serialized = protocol::versioned::ToGateway::latest(protocol::ToGateway { + let msg_serialized = protocol::versioned::ToGateway::wrap_latest(protocol::ToGateway { message: close_message.clone(), }) .serialize_with_embedded_version(protocol::PROTOCOL_VERSION) @@ -193,6 +194,6 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe { } // This will determine the close frame sent back to the runner websocket - lifecycle_res + lifecycle_res.map(|_| None) } } diff --git a/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs b/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs index 9dc4179a2a..9d249b0659 100644 --- a/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs +++ b/engine/packages/pegboard-runner/src/pubsub_to_client_task.rs @@ -37,25 +37,50 @@ pub async fn task(conn: Arc, mut sub: Subscriber) -> Result<()> { protocol::ToClient::ToClientClose => return Err(errors::WsError::Eviction.build()), // Handle tunnel messages protocol::ToClient::ToClientTunnelMessage(tunnel_msg) => { - // Save active request - // - // This will remove gateway_reply_to from the message since it does not need to be sent to the - // client - if let Some(reply_to) = tunnel_msg.gateway_reply_to.take() { - tracing::debug!(?tunnel_msg.request_id, ?reply_to, "creating active request"); - let mut active_requests = conn.tunnel_active_requests.lock().await; - active_requests.insert( - tunnel_msg.request_id, - TunnelActiveRequest { - gateway_reply_to: reply_to, - }, - ); - } - match tunnel_msg.message_kind { + protocol::ToClientTunnelMessageKind::ToClientRequestStart(_) => { + // Save active request + // + // This will remove gateway_reply_to from the message since it does not need to be sent to the + // client + if let Some(reply_to) = tunnel_msg.gateway_reply_to.take() { + tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), ?reply_to, "creating active request"); + let mut active_requests = conn.tunnel_active_requests.lock().await; + active_requests.insert( + tunnel_msg.request_id, + TunnelActiveRequest { + gateway_reply_to: reply_to, + is_ws: false, + }, + ); + } + } + // If terminal, remove active request tracking + protocol::ToClientTunnelMessageKind::ToClientRequestAbort => { + tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), "removing active conn due to close message"); + let mut active_requests = conn.tunnel_active_requests.lock().await; + active_requests.remove(&tunnel_msg.request_id); + } + protocol::ToClientTunnelMessageKind::ToClientWebSocketOpen(_) => { + // Save active request + // + // This will remove gateway_reply_to from the message since it does not need to be sent to the + // client + if let Some(reply_to) = tunnel_msg.gateway_reply_to.take() { + tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), ?reply_to, "creating active request"); + let mut active_requests = conn.tunnel_active_requests.lock().await; + active_requests.insert( + tunnel_msg.request_id, + TunnelActiveRequest { + gateway_reply_to: reply_to, + is_ws: true, + }, + ); + } + } // If terminal, remove active request tracking protocol::ToClientTunnelMessageKind::ToClientWebSocketClose(_) => { - tracing::debug!(?tunnel_msg.request_id, "removing active conn due to close message"); + tracing::debug!(request_id=?Uuid::from_bytes(tunnel_msg.request_id), "removing active conn due to close message"); let mut active_requests = conn.tunnel_active_requests.lock().await; active_requests.remove(&tunnel_msg.request_id); } @@ -67,7 +92,7 @@ pub async fn task(conn: Arc, mut sub: Subscriber) -> Result<()> { // Forward raw message to WebSocket let serialized_msg = - match versioned::ToClient::latest(msg).serialize_version(conn.protocol_version) { + match versioned::ToClient::wrap_latest(msg).serialize_version(conn.protocol_version) { Result::Ok(x) => x, Err(err) => { tracing::error!(?err, "failed to serialize tunnel message"); diff --git a/engine/packages/pegboard-serverless/src/lib.rs b/engine/packages/pegboard-serverless/src/lib.rs index fb58597f64..a5125a90a1 100644 --- a/engine/packages/pegboard-serverless/src/lib.rs +++ b/engine/packages/pegboard-serverless/src/lib.rs @@ -383,7 +383,7 @@ async fn outbound_handler( } } Err(sse::Error::StreamEnded) => { - tracing::debug!("outbound req stopped early"); + tracing::debug!(?runner_id, "outbound req stopped early"); return Ok(()); } @@ -417,7 +417,7 @@ async fn outbound_handler( match event { Ok(sse::Event::Open) => {} Ok(sse::Event::Message(msg)) => { - tracing::debug!(%msg.data, "received outbound req message"); + tracing::debug!(%msg.data, ?runner_id, "received outbound req message"); // If runner_id is none at this point it means we did not send the stopping signal yet, so // send it now @@ -451,7 +451,7 @@ async fn outbound_handler( tokio::select! { res = wait_for_shutdown_fut => return res.map_err(Into::into), _ = tokio::time::sleep(DRAIN_GRACE_PERIOD) => { - tracing::debug!("reached drain grace period before runner shut down") + tracing::debug!(?runner_id, "reached drain grace period before runner shut down") } } @@ -463,15 +463,15 @@ async fn outbound_handler( publish_to_client_stop(ctx, runner_id).await?; } - tracing::debug!("outbound req stopped"); + tracing::debug!(?runner_id, "outbound req stopped"); Ok(()) } async fn drain_runner(ctx: &StandaloneCtx, runner_id: Id) -> Result<()> { let res = ctx - .signal(pegboard::workflows::runner::Forward { - inner: protocol::ToServer::ToServerStopping, + .signal(pegboard::workflows::runner::Stop { + reset_actor_rescheduling: true, }) .to_workflow::() .tag("runner_id", runner_id) @@ -501,7 +501,7 @@ async fn publish_to_client_stop(ctx: &StandaloneCtx, runner_id: Id) -> Result<() let receiver_subject = pegboard::pubsub_subjects::RunnerReceiverSubject::new(runner_id).to_string(); - let message_serialized = rivet_runner_protocol::versioned::ToClient::latest( + let message_serialized = rivet_runner_protocol::versioned::ToClient::wrap_latest( rivet_runner_protocol::ToClient::ToClientClose, ) .serialize_with_embedded_version(rivet_runner_protocol::PROTOCOL_VERSION)?; diff --git a/engine/packages/pegboard/src/keys/ns.rs b/engine/packages/pegboard/src/keys/ns.rs index 33dc7ddbcc..a23ed5d1cb 100644 --- a/engine/packages/pegboard/src/keys/ns.rs +++ b/engine/packages/pegboard/src/keys/ns.rs @@ -52,7 +52,7 @@ impl FormalKey for RunnerAllocIdxKey { } fn serialize(&self, value: Self::Value) -> Result> { - rivet_data::versioned::RunnerAllocIdxKeyData::latest(value.try_into()?) + rivet_data::versioned::RunnerAllocIdxKeyData::wrap_latest(value.try_into()?) .serialize_with_embedded_version( rivet_data::PEGBOARD_NAMESPACE_RUNNER_ALLOC_IDX_VERSION, ) @@ -570,7 +570,7 @@ impl FormalKey for ActorByKeyKey { } fn serialize(&self, value: Self::Value) -> Result> { - rivet_data::versioned::ActorByKeyKeyData::latest(value.try_into()?) + rivet_data::versioned::ActorByKeyKeyData::wrap_latest(value.try_into()?) .serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_ACTOR_BY_KEY_VERSION) } } @@ -1186,7 +1186,7 @@ impl FormalKey for RunnerByKeyKey { } fn serialize(&self, value: Self::Value) -> Result> { - rivet_data::versioned::RunnerByKeyKeyData::latest(value.try_into()?) + rivet_data::versioned::RunnerByKeyKeyData::wrap_latest(value.try_into()?) .serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_RUNNER_BY_KEY_VERSION) } } @@ -1247,7 +1247,7 @@ impl FormalKey for ActorNameKey { } fn serialize(&self, value: Self::Value) -> Result> { - rivet_data::versioned::ActorNameKeyData::latest(value.try_into()?) + rivet_data::versioned::ActorNameKeyData::wrap_latest(value.try_into()?) .serialize_with_embedded_version(rivet_data::PEGBOARD_NAMESPACE_ACTOR_NAME_VERSION) } } diff --git a/engine/packages/pegboard/src/keys/runner.rs b/engine/packages/pegboard/src/keys/runner.rs index fdc726f239..9c038b4a73 100644 --- a/engine/packages/pegboard/src/keys/runner.rs +++ b/engine/packages/pegboard/src/keys/runner.rs @@ -767,7 +767,7 @@ impl FormalChunkedKey for MetadataKey { fn split(&self, value: Self::Value) -> Result>> { Ok( - rivet_data::versioned::MetadataKeyData::latest(value.try_into()?) + rivet_data::versioned::MetadataKeyData::wrap_latest(value.try_into()?) .serialize_with_embedded_version(rivet_data::PEGBOARD_RUNNER_METADATA_VERSION)? .chunks(universaldb::utils::CHUNK_SIZE) .map(|x| x.to_vec()) diff --git a/engine/packages/pegboard/src/workflows/actor/mod.rs b/engine/packages/pegboard/src/workflows/actor/mod.rs index cae0baae00..e657fad85f 100644 --- a/engine/packages/pegboard/src/workflows/actor/mod.rs +++ b/engine/packages/pegboard/src/workflows/actor/mod.rs @@ -268,6 +268,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> Main::Lost(Lost { generation: state.generation, force_reschedule: false, + reset_rescheduling: false, }) } } else if let Some(alarm_ts) = state.alarm_ts { @@ -372,7 +373,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> protocol::ActorStateStopped { code, .. }, ) => { if let Some(res) = - handle_stopped(ctx, &input, state, Some(code), false, false) + handle_stopped(ctx, &input, state, Some(code), None) .await? { return Ok(Loop::Break(res)); @@ -393,7 +394,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> state.sleeping = false; state.will_wake = false; - match runtime::reschedule_actor(ctx, &input, state, false).await? { + match runtime::reschedule_actor(ctx, &input, state, false, false).await? { runtime::SpawnActorOutput::Allocated { .. } => {}, runtime::SpawnActorOutput::Sleep => { state.sleeping = true; @@ -434,7 +435,7 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> } if let Some(res) = - handle_stopped(ctx, &input, state, None, true, sig.force_reschedule).await? + handle_stopped(ctx, &input, state, None, Some(sig)).await? { return Ok(Loop::Break(res)); } @@ -493,10 +494,9 @@ async fn handle_stopped( input: &Input, state: &mut runtime::LifecycleState, code: Option, - lost: bool, - force_reschedule: bool, + lost_sig: Option, ) -> Result> { - tracing::debug!(?code, %force_reschedule, "actor stopped"); + tracing::debug!(?code, ?lost_sig, "actor stopped"); // Reset retry count on successful exit if let Some(protocol::StopCode::Ok) = code { @@ -541,7 +541,7 @@ async fn handle_stopped( } // Kill old actor if lost (just in case it ended up allocating) - if let (true, Some(old_runner_workflow_id)) = (lost, old_runner_workflow_id) { + if let (Some(_), Some(old_runner_workflow_id)) = (&lost_sig, old_runner_workflow_id) { ctx.signal(crate::workflows::runner::Command { inner: protocol::Command::CommandStopActor(protocol::CommandStopActor { actor_id: input.actor_id.to_string(), @@ -553,13 +553,24 @@ async fn handle_stopped( .await?; } + let (force_reschedule, reset_rescheduling) = if let Some(lost_sig) = &lost_sig { + (lost_sig.force_reschedule, lost_sig.reset_rescheduling) + } else { + (false, false) + }; + // Reschedule no matter what if force_reschedule { - match runtime::reschedule_actor(ctx, &input, state, true).await? { + match runtime::reschedule_actor(ctx, &input, state, true, reset_rescheduling).await? { runtime::SpawnActorOutput::Allocated { .. } => {} // NOTE: This should be unreachable because force_reschedule is true runtime::SpawnActorOutput::Sleep => { state.sleeping = true; + + ctx.activity(runtime::SetSleepingInput { + actor_id: input.actor_id, + }) + .await?; } runtime::SpawnActorOutput::Destroy => { // Destroyed early @@ -578,7 +589,9 @@ async fn handle_stopped( match (input.crash_policy, failed) { (CrashPolicy::Restart, true) => { - match runtime::reschedule_actor(ctx, &input, state, false).await? { + match runtime::reschedule_actor(ctx, &input, state, false, reset_rescheduling) + .await? + { runtime::SpawnActorOutput::Allocated { .. } => {} // NOTE: Its not possible for `SpawnActorOutput::Sleep` to be returned here, the crash // policy is `Restart`. @@ -608,7 +621,7 @@ async fn handle_stopped( return Ok(Some(runtime::LifecycleRes { generation: state.generation, - kill: lost, + kill: lost_sig.is_some(), })); } } @@ -617,7 +630,7 @@ async fn handle_stopped( else if state.will_wake { state.sleeping = false; - match runtime::reschedule_actor(ctx, &input, state, false).await? { + match runtime::reschedule_actor(ctx, &input, state, false, reset_rescheduling).await? { runtime::SpawnActorOutput::Allocated { .. } => {} runtime::SpawnActorOutput::Sleep => { state.sleeping = true; @@ -676,10 +689,14 @@ pub struct Event { #[signal("pegboard_actor_wake")] pub struct Wake {} +#[derive(Debug)] #[signal("pegboard_actor_lost")] pub struct Lost { pub generation: u32, + /// Immediately reschedules the actor regardless of its crash policy. pub force_reschedule: bool, + /// Resets the rescheduling retry count to 0. + pub reset_rescheduling: bool, } #[signal("pegboard_actor_destroy")] diff --git a/engine/packages/pegboard/src/workflows/actor/runtime.rs b/engine/packages/pegboard/src/workflows/actor/runtime.rs index e98eb5146a..f4d03e80a5 100644 --- a/engine/packages/pegboard/src/workflows/actor/runtime.rs +++ b/engine/packages/pegboard/src/workflows/actor/runtime.rs @@ -614,6 +614,7 @@ pub async fn reschedule_actor( input: &Input, state: &mut LifecycleState, force_reschedule: bool, + reset_rescheduling: bool, ) -> Result { tracing::debug!(actor_id=?input.actor_id, "rescheduling actor"); @@ -633,7 +634,7 @@ pub async fn reschedule_actor( }) .await?; - state.reschedule_state.retry_count = if reset { + state.reschedule_state.retry_count = if reset || reset_rescheduling { 0 } else { state.reschedule_state.retry_count + 1 @@ -726,7 +727,7 @@ struct CompareRetryInput { async fn compare_retry(ctx: &ActivityCtx, input: &CompareRetryInput) -> Result<(i64, bool)> { let now = util::timestamp::now(); - // If the last retry ts is more than RETRY_RESET_DURATION_MS, reset retry count + // If the last retry ts is more than RETRY_RESET_DURATION_MS ago, reset retry count Ok((now, input.last_retry_ts < now - RETRY_RESET_DURATION_MS)) } diff --git a/engine/packages/pegboard/src/workflows/actor/setup.rs b/engine/packages/pegboard/src/workflows/actor/setup.rs index 313bc3b369..136bad24a5 100644 --- a/engine/packages/pegboard/src/workflows/actor/setup.rs +++ b/engine/packages/pegboard/src/workflows/actor/setup.rs @@ -7,7 +7,7 @@ use super::State; use crate::{errors, keys}; -const MAX_INPUT_SIZE: usize = util::file_size::mebibytes(4) as usize; +const MAX_INPUT_SIZE: usize = util::size::mebibytes(4) as usize; #[derive(Debug, Clone, Serialize, Deserialize, Hash)] pub struct ValidateInput { diff --git a/engine/packages/pegboard/src/workflows/runner.rs b/engine/packages/pegboard/src/workflows/runner.rs index 610b9ac370..64f5594bbb 100644 --- a/engine/packages/pegboard/src/workflows/runner.rs +++ b/engine/packages/pegboard/src/workflows/runner.rs @@ -166,6 +166,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> for event in &events { if event.index <= state.last_event_idx { tracing::warn!(idx=%event.index, "event already received, ignoring"); + continue; } let actor_id = @@ -227,40 +228,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> ctx.activity(AckCommandsInput { last_command_idx }).await?; } protocol::ToServer::ToServerStopping => { - if !state.draining { - // The workflow will enter a draining state where it can still process signals if - // needed. After RUNNER_LOST_THRESHOLD_MS it will exit this loop and stop. - state.draining = true; - - // Can't parallelize these two activities, requires reading from state - ctx.activity(ClearDbInput { - runner_id: input.runner_id, - name: input.name.clone(), - key: input.key.clone(), - update_state: RunnerState::Draining, - }) - .await?; - - let actors = ctx - .activity(FetchRemainingActorsInput { - runner_id: input.runner_id, - }) - .await?; - - // Set all remaining actors to lost immediately - if !actors.is_empty() { - for (actor_id, generation) in &actors { - ctx.signal(crate::workflows::actor::Lost { - generation: *generation, - force_reschedule: false, - }) - .to_workflow::() - .tag("actor_id", actor_id) - .send() - .await?; - } - } - } + handle_stopping(ctx, &input, state, false).await?; } protocol::ToServer::ToServerPing(_) | protocol::ToServer::ToServerKvRequest(_) @@ -291,6 +259,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> // Because this is a race condition, we want the actor to reschedule // regardless of its crash policy force_reschedule: true, + reset_rescheduling: true, }) .to_workflow::() .tag("actor_id", actor_id) @@ -347,14 +316,17 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> .await?; } } + Some(Main::Stop(sig)) => { + handle_stopping(ctx, &input, state, sig.reset_actor_rescheduling).await?; + } None => { - if state.draining - || ctx - .activity(CheckExpiredInput { - runner_id: input.runner_id, - }) - .await? - { + let expired = ctx + .activity(CheckExpiredInput { + runner_id: input.runner_id, + }) + .await?; + + if state.draining || expired { return Ok(Loop::Break(())); } } @@ -386,6 +358,7 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> .signal(crate::workflows::actor::Lost { generation, force_reschedule: false, + reset_rescheduling: false, }) .to_workflow::() .tag("actor_id", actor_id) @@ -416,6 +389,51 @@ pub async fn pegboard_runner(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> Ok(()) } +async fn handle_stopping( + ctx: &mut WorkflowCtx, + input: &Input, + state: &mut LifecycleState, + reset_actor_rescheduling: bool, +) -> Result<()> { + if !state.draining { + // The workflow will enter a draining state where it can still process signals if + // needed. After RUNNER_LOST_THRESHOLD_MS it will exit this loop and stop. + state.draining = true; + + // Can't parallelize these two activities, requires reading from state + ctx.activity(ClearDbInput { + runner_id: input.runner_id, + name: input.name.clone(), + key: input.key.clone(), + update_state: RunnerState::Draining, + }) + .await?; + + let actors = ctx + .activity(FetchRemainingActorsInput { + runner_id: input.runner_id, + }) + .await?; + + // Set all remaining actors to lost immediately + if !actors.is_empty() { + for (actor_id, generation) in &actors { + ctx.signal(crate::workflows::actor::Lost { + generation: *generation, + force_reschedule: false, + reset_rescheduling: reset_actor_rescheduling, + }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await?; + } + } + } + + Ok(()) +} + #[derive(Debug, Serialize, Deserialize)] struct LifecycleState { draining: bool, @@ -1124,7 +1142,7 @@ async fn send_message_to_runner(ctx: &ActivityCtx, input: &SendMessageToRunnerIn let receiver_subject = crate::pubsub_subjects::RunnerReceiverSubject::new(input.runner_id).to_string(); - let message_serialized = versioned::ToClient::latest(input.message.clone()) + let message_serialized = versioned::ToClient::wrap_latest(input.message.clone()) .serialize_with_embedded_version(PROTOCOL_VERSION)?; ctx.ups()? @@ -1137,6 +1155,11 @@ async fn send_message_to_runner(ctx: &ActivityCtx, input: &SendMessageToRunnerIn #[signal("pegboard_runner_check_queue")] pub struct CheckQueue {} +#[signal("pegboard_runner_stop")] +pub struct Stop { + pub reset_actor_rescheduling: bool, +} + #[signal("pegboard_runner_command")] pub struct Command { pub inner: protocol::Command, @@ -1152,4 +1175,5 @@ join_signal!(Main { // Forwarded from the ws to this workflow Forward(Forward), CheckQueue, + Stop, }); diff --git a/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs b/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs index e43fbe6a1d..704c4f3823 100644 --- a/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs +++ b/engine/packages/universaldb/src/driver/rocksdb/transaction_task.rs @@ -137,7 +137,7 @@ impl TransactionTask { } } - fn create_transaction(&self) -> RocksDbTransaction { + fn create_transaction(&self) -> RocksDbTransaction<'_, OptimisticTransactionDB> { let write_opts = WriteOptions::default(); let txn_opts = rocksdb::OptimisticTransactionOptions::default(); self.db.transaction_opt(&write_opts, &txn_opts) diff --git a/engine/packages/universaldb/src/tx_ops.rs b/engine/packages/universaldb/src/tx_ops.rs index aedd4f41db..878ae168e8 100644 --- a/engine/packages/universaldb/src/tx_ops.rs +++ b/engine/packages/universaldb/src/tx_ops.rs @@ -34,17 +34,6 @@ pub enum Operation { }, } -impl Operation { - pub fn sorting_key(&self) -> &[u8] { - match self { - Operation::Set { key, .. } => key, - Operation::Clear { key } => key, - Operation::ClearRange { begin, .. } => begin, - Operation::AtomicOp { key, .. } => key, - } - } -} - #[derive(Debug, Clone)] pub enum GetOutput { Value(Vec), diff --git a/engine/packages/universalpubsub/src/chunking.rs b/engine/packages/universalpubsub/src/chunking.rs index 2c230efbb9..2d276233fc 100644 --- a/engine/packages/universalpubsub/src/chunking.rs +++ b/engine/packages/universalpubsub/src/chunking.rs @@ -144,7 +144,7 @@ pub fn split_payload_into_chunks( let start_ups_message = rivet_ups_protocol::UpsMessage { body: MessageBody::MessageStart(start_message), }; - let start_overhead = UpsMessage::latest(start_ups_message) + let start_overhead = UpsMessage::wrap_latest(start_ups_message) .serialize_with_embedded_version(PROTOCOL_VERSION)? .len(); @@ -157,7 +157,7 @@ pub fn split_payload_into_chunks( let chunk_ups_message = rivet_ups_protocol::UpsMessage { body: MessageBody::MessageChunk(chunk_message), }; - let chunk_overhead = UpsMessage::latest(chunk_ups_message) + let chunk_overhead = UpsMessage::wrap_latest(chunk_ups_message) .serialize_with_embedded_version(PROTOCOL_VERSION)? .len(); @@ -222,5 +222,5 @@ pub fn encode_chunk( }; let ups_message = rivet_ups_protocol::UpsMessage { body }; - UpsMessage::latest(ups_message).serialize_with_embedded_version(PROTOCOL_VERSION) + UpsMessage::wrap_latest(ups_message).serialize_with_embedded_version(PROTOCOL_VERSION) } diff --git a/engine/packages/universalpubsub/src/driver/postgres/mod.rs b/engine/packages/universalpubsub/src/driver/postgres/mod.rs index c2f20b68d0..ca801e81b2 100644 --- a/engine/packages/universalpubsub/src/driver/postgres/mod.rs +++ b/engine/packages/universalpubsub/src/driver/postgres/mod.rs @@ -292,10 +292,9 @@ impl PubSubDriver for PostgresDriver { // Try to LISTEN if client is available, but don't fail if disconnected // The reconnection logic will handle re-subscribing if let Some(client) = self.client.lock().await.clone() { - let span = tracing::trace_span!("pg_listen"); match client .execute(&format!("LISTEN \"{hashed}\""), &[]) - .instrument(span) + .instrument(tracing::trace_span!("pg_listen")) .await { Result::Ok(_) => { @@ -359,7 +358,7 @@ impl PubSubDriver for PostgresDriver { // Retry getting a connection from the pool with backoff in case the connection is // currently disconnected let mut backoff = Backoff::default(); - let mut last_error = None; + let mut last_error; loop { match self.pool.get().await { @@ -368,10 +367,9 @@ impl PubSubDriver for PostgresDriver { match conn.execute("SELECT 1", &[]).await { Result::Ok(_) => { // Connection is good, use it for NOTIFY - let span = tracing::trace_span!("pg_notify"); match conn .execute(&format!("NOTIFY \"{hashed}\", '{encoded}'"), &[]) - .instrument(span) + .instrument(tracing::trace_span!("pg_notify")) .await { Result::Ok(_) => return Ok(()), diff --git a/engine/packages/util/src/lib.rs b/engine/packages/util/src/lib.rs index 39b01b71bb..213e0be058 100644 --- a/engine/packages/util/src/lib.rs +++ b/engine/packages/util/src/lib.rs @@ -6,7 +6,6 @@ pub mod billing; pub mod check; pub mod duration; pub mod faker; -pub mod file_size; pub mod format; pub mod future; pub mod geo; @@ -14,6 +13,7 @@ pub mod math; pub mod req; pub mod serde; pub mod signal; +pub mod size; pub mod sort; pub mod timestamp; pub mod url; diff --git a/engine/packages/util/src/file_size.rs b/engine/packages/util/src/size.rs similarity index 100% rename from engine/packages/util/src/file_size.rs rename to engine/packages/util/src/size.rs diff --git a/engine/sdks/rust/data/src/versioned/mod.rs b/engine/sdks/rust/data/src/versioned/mod.rs index ca5b6ced97..f6814cad32 100644 --- a/engine/sdks/rust/data/src/versioned/mod.rs +++ b/engine/sdks/rust/data/src/versioned/mod.rs @@ -14,11 +14,11 @@ pub enum RunnerAllocIdxKeyData { impl OwnedVersionedData for RunnerAllocIdxKeyData { type Latest = pegboard_namespace_runner_alloc_idx_v1::Data; - fn latest(latest: pegboard_namespace_runner_alloc_idx_v1::Data) -> Self { + fn wrap_latest(latest: pegboard_namespace_runner_alloc_idx_v1::Data) -> Self { RunnerAllocIdxKeyData::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let RunnerAllocIdxKeyData::V1(data) = self { Ok(data) @@ -48,11 +48,11 @@ pub enum MetadataKeyData { impl OwnedVersionedData for MetadataKeyData { type Latest = pegboard_runner_metadata_v1::Data; - fn latest(latest: pegboard_runner_metadata_v1::Data) -> Self { + fn wrap_latest(latest: pegboard_runner_metadata_v1::Data) -> Self { MetadataKeyData::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let MetadataKeyData::V1(data) = self { Ok(data) @@ -82,11 +82,11 @@ pub enum ActorByKeyKeyData { impl OwnedVersionedData for ActorByKeyKeyData { type Latest = pegboard_namespace_actor_by_key_v1::Data; - fn latest(latest: pegboard_namespace_actor_by_key_v1::Data) -> Self { + fn wrap_latest(latest: pegboard_namespace_actor_by_key_v1::Data) -> Self { ActorByKeyKeyData::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let ActorByKeyKeyData::V1(data) = self { Ok(data) @@ -116,11 +116,11 @@ pub enum RunnerByKeyKeyData { impl OwnedVersionedData for RunnerByKeyKeyData { type Latest = pegboard_namespace_runner_by_key_v1::Data; - fn latest(latest: pegboard_namespace_runner_by_key_v1::Data) -> Self { + fn wrap_latest(latest: pegboard_namespace_runner_by_key_v1::Data) -> Self { RunnerByKeyKeyData::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let RunnerByKeyKeyData::V1(data) = self { Ok(data) @@ -150,11 +150,11 @@ pub enum ActorNameKeyData { impl OwnedVersionedData for ActorNameKeyData { type Latest = pegboard_namespace_actor_name_v1::Data; - fn latest(latest: pegboard_namespace_actor_name_v1::Data) -> Self { + fn wrap_latest(latest: pegboard_namespace_actor_name_v1::Data) -> Self { ActorNameKeyData::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let ActorNameKeyData::V1(data) = self { Ok(data) diff --git a/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs b/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs index 252b48f47f..c902225806 100644 --- a/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs +++ b/engine/sdks/rust/data/src/versioned/namespace_runner_config.rs @@ -11,11 +11,11 @@ pub enum NamespaceRunnerConfig { impl OwnedVersionedData for NamespaceRunnerConfig { type Latest = namespace_runner_config_v2::RunnerConfig; - fn latest(latest: namespace_runner_config_v2::RunnerConfig) -> Self { + fn wrap_latest(latest: namespace_runner_config_v2::RunnerConfig) -> Self { NamespaceRunnerConfig::V2(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let NamespaceRunnerConfig::V2(data) = self { Ok(data) @@ -87,7 +87,7 @@ impl NamespaceRunnerConfig { match self { NamespaceRunnerConfig::V1(_) => Ok(self), NamespaceRunnerConfig::V2(config) => { - let namespace_runner_config_v2::RunnerConfig { metadata, kind } = config; + let namespace_runner_config_v2::RunnerConfig { kind, .. } = config; match kind { namespace_runner_config_v2::RunnerConfigKind::Serverless(serverless) => { diff --git a/engine/sdks/rust/epoxy-protocol/src/versioned.rs b/engine/sdks/rust/epoxy-protocol/src/versioned.rs index 8475286679..20a9507685 100644 --- a/engine/sdks/rust/epoxy-protocol/src/versioned.rs +++ b/engine/sdks/rust/epoxy-protocol/src/versioned.rs @@ -10,11 +10,11 @@ pub enum Request { impl OwnedVersionedData for Request { type Latest = v1::Request; - fn latest(latest: v1::Request) -> Self { + fn wrap_latest(latest: v1::Request) -> Self { Request::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let Request::V1(data) = self { Ok(data) @@ -50,11 +50,11 @@ pub enum Response { impl OwnedVersionedData for Response { type Latest = v1::Response; - fn latest(latest: v1::Response) -> Self { + fn wrap_latest(latest: v1::Response) -> Self { Response::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let Response::V1(data) = self { Ok(data) @@ -90,11 +90,11 @@ pub enum LogEntry { impl OwnedVersionedData for LogEntry { type Latest = v1::LogEntry; - fn latest(latest: v1::LogEntry) -> Self { + fn wrap_latest(latest: v1::LogEntry) -> Self { LogEntry::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let LogEntry::V1(data) = self { Ok(data) @@ -134,11 +134,11 @@ pub enum ClusterConfig { impl OwnedVersionedData for ClusterConfig { type Latest = v1::ClusterConfig; - fn latest(latest: v1::ClusterConfig) -> Self { + fn wrap_latest(latest: v1::ClusterConfig) -> Self { ClusterConfig::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let ClusterConfig::V1(data) = self { Ok(data) @@ -178,11 +178,11 @@ pub enum Ballot { impl OwnedVersionedData for Ballot { type Latest = v1::Ballot; - fn latest(latest: v1::Ballot) -> Self { + fn wrap_latest(latest: v1::Ballot) -> Self { Ballot::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let Ballot::V1(data) = self { Ok(data) diff --git a/engine/sdks/rust/runner-protocol/build.rs b/engine/sdks/rust/runner-protocol/build.rs index 4f1bb3902a..3d8188330a 100644 --- a/engine/sdks/rust/runner-protocol/build.rs +++ b/engine/sdks/rust/runner-protocol/build.rs @@ -22,7 +22,10 @@ fn main() -> Result<(), Box> { vbare_compiler::process_schemas_with_config(&schema_dir, &cfg)?; // TypeScript SDK generation - let cli_js_path = workspace_root.join("node_modules/@bare-ts/tools/dist/bin/cli.js"); + let cli_js_path = workspace_root + .parent() + .unwrap() + .join("node_modules/@bare-ts/tools/dist/bin/cli.js"); if cli_js_path.exists() { typescript::generate_sdk(&schema_dir); } else { @@ -59,16 +62,20 @@ mod typescript { panic!("Failed to create SDK directory: {}", e); } - let output = - Command::new(workspace_root.join("node_modules/@bare-ts/tools/dist/bin/cli.js")) - .arg("compile") - .arg("--generator") - .arg("ts") - .arg(highest_version_path) - .arg("-o") - .arg(src_dir.join("index.ts")) - .output() - .expect("Failed to execute bare compiler for TypeScript"); + let output = Command::new( + workspace_root + .parent() + .unwrap() + .join("node_modules/@bare-ts/tools/dist/bin/cli.js"), + ) + .arg("compile") + .arg("--generator") + .arg("ts") + .arg(highest_version_path) + .arg("-o") + .arg(src_dir.join("index.ts")) + .output() + .expect("Failed to execute bare compiler for TypeScript"); if !output.status.success() { panic!( diff --git a/engine/sdks/rust/runner-protocol/src/lib.rs b/engine/sdks/rust/runner-protocol/src/lib.rs index 676c99e464..04553acb49 100644 --- a/engine/sdks/rust/runner-protocol/src/lib.rs +++ b/engine/sdks/rust/runner-protocol/src/lib.rs @@ -2,6 +2,6 @@ pub mod generated; pub mod versioned; // Re-export latest -pub use generated::v1::*; +pub use generated::v2::*; -pub const PROTOCOL_VERSION: u16 = 1; +pub const PROTOCOL_VERSION: u16 = 2; diff --git a/engine/sdks/rust/runner-protocol/src/versioned.rs b/engine/sdks/rust/runner-protocol/src/versioned.rs index 75f6fa1167..eb95c194c5 100644 --- a/engine/sdks/rust/runner-protocol/src/versioned.rs +++ b/engine/sdks/rust/runner-protocol/src/versioned.rs @@ -1,22 +1,22 @@ use anyhow::{Ok, Result, bail}; use vbare::OwnedVersionedData; -use crate::{PROTOCOL_VERSION, generated::v1}; +use crate::generated::{v1, v2}; pub enum ToClient { V1(v1::ToClient), + V2(v2::ToClient), } impl OwnedVersionedData for ToClient { - type Latest = v1::ToClient; + type Latest = v2::ToClient; - fn latest(latest: v1::ToClient) -> Self { - ToClient::V1(latest) + fn wrap_latest(latest: v2::ToClient) -> Self { + ToClient::V2(latest) } - fn into_latest(self) -> Result { - #[allow(irrefutable_let_patterns)] - if let ToClient::V1(data) = self { + fn unwrap_latest(self) -> Result { + if let ToClient::V2(data) = self { Ok(data) } else { bail!("version not latest"); @@ -26,6 +26,7 @@ impl OwnedVersionedData for ToClient { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 1 => Ok(ToClient::V1(serde_bare::from_slice(payload)?)), + 2 => Ok(ToClient::V2(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -33,24 +34,177 @@ impl OwnedVersionedData for ToClient { fn serialize_version(self, _version: u16) -> Result> { match self { ToClient::V1(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToClient::V2(data) => serde_bare::to_vec(&data).map_err(Into::into), + } + } + + fn deserialize_converters() -> Vec Result> { + vec![Self::v1_to_v2] + } + + fn serialize_converters() -> Vec Result> { + vec![Self::v2_to_v1] + } +} + +impl ToClient { + fn v1_to_v2(self) -> Result { + match self { + ToClient::V1(x) => { + let inner = match x { + v1::ToClient::ToClientInit(init) => { + v2::ToClient::ToClientInit(v2::ToClientInit { + runner_id: init.runner_id, + last_event_idx: init.last_event_idx, + metadata: v2::ProtocolMetadata { + runner_lost_threshold: init.metadata.runner_lost_threshold, + }, + }) + } + v1::ToClient::ToClientClose => v2::ToClient::ToClientClose, + v1::ToClient::ToClientCommands(commands) => v2::ToClient::ToClientCommands( + commands + .into_iter() + .map(|cmd| v2::CommandWrapper { + index: cmd.index, + inner: match cmd.inner { + v1::Command::CommandStartActor(start) => { + v2::Command::CommandStartActor(v2::CommandStartActor { + actor_id: start.actor_id, + generation: start.generation, + config: v2::ActorConfig { + name: start.config.name, + key: start.config.key, + create_ts: start.config.create_ts, + input: start.config.input, + }, + }) + } + v1::Command::CommandStopActor(stop) => { + v2::Command::CommandStopActor(v2::CommandStopActor { + actor_id: stop.actor_id, + generation: stop.generation, + }) + } + }, + }) + .collect(), + ), + v1::ToClient::ToClientAckEvents(ack) => { + v2::ToClient::ToClientAckEvents(v2::ToClientAckEvents { + last_event_idx: ack.last_event_idx, + }) + } + v1::ToClient::ToClientKvResponse(resp) => { + v2::ToClient::ToClientKvResponse(v2::ToClientKvResponse { + request_id: resp.request_id, + data: convert_kv_response_data_v1_to_v2(resp.data), + }) + } + v1::ToClient::ToClientTunnelMessage(msg) => { + v2::ToClient::ToClientTunnelMessage(v2::ToClientTunnelMessage { + request_id: msg.request_id, + message_id: msg.message_id, + message_kind: convert_to_client_tunnel_message_kind_v1_to_v2( + msg.message_kind, + ), + gateway_reply_to: msg.gateway_reply_to, + }) + } + }; + + Ok(ToClient::V2(inner)) + } + value @ ToClient::V2(_) => Ok(value), + } + } + + fn v2_to_v1(self) -> Result { + match self { + ToClient::V1(_) => Ok(self), + ToClient::V2(x) => { + let inner = match x { + v2::ToClient::ToClientInit(init) => { + v1::ToClient::ToClientInit(v1::ToClientInit { + runner_id: init.runner_id, + last_event_idx: init.last_event_idx, + metadata: v1::ProtocolMetadata { + runner_lost_threshold: init.metadata.runner_lost_threshold, + }, + }) + } + v2::ToClient::ToClientClose => v1::ToClient::ToClientClose, + v2::ToClient::ToClientCommands(commands) => v1::ToClient::ToClientCommands( + commands + .into_iter() + .map(|cmd| v1::CommandWrapper { + index: cmd.index, + inner: match cmd.inner { + v2::Command::CommandStartActor(start) => { + v1::Command::CommandStartActor(v1::CommandStartActor { + actor_id: start.actor_id, + generation: start.generation, + config: v1::ActorConfig { + name: start.config.name, + key: start.config.key, + create_ts: start.config.create_ts, + input: start.config.input, + }, + }) + } + v2::Command::CommandStopActor(stop) => { + v1::Command::CommandStopActor(v1::CommandStopActor { + actor_id: stop.actor_id, + generation: stop.generation, + }) + } + }, + }) + .collect(), + ), + v2::ToClient::ToClientAckEvents(ack) => { + v1::ToClient::ToClientAckEvents(v1::ToClientAckEvents { + last_event_idx: ack.last_event_idx, + }) + } + v2::ToClient::ToClientKvResponse(resp) => { + v1::ToClient::ToClientKvResponse(v1::ToClientKvResponse { + request_id: resp.request_id, + data: convert_kv_response_data_v2_to_v1(resp.data), + }) + } + v2::ToClient::ToClientTunnelMessage(msg) => { + v1::ToClient::ToClientTunnelMessage(v1::ToClientTunnelMessage { + request_id: msg.request_id, + message_id: msg.message_id, + message_kind: convert_to_client_tunnel_message_kind_v2_to_v1( + msg.message_kind, + )?, + gateway_reply_to: msg.gateway_reply_to, + }) + } + }; + + Ok(ToClient::V1(inner)) + } } } } pub enum ToServer { V1(v1::ToServer), + V2(v2::ToServer), } impl OwnedVersionedData for ToServer { - type Latest = v1::ToServer; + type Latest = v2::ToServer; - fn latest(latest: v1::ToServer) -> Self { - ToServer::V1(latest) + fn wrap_latest(latest: v2::ToServer) -> Self { + ToServer::V2(latest) } - fn into_latest(self) -> Result { - #[allow(irrefutable_let_patterns)] - if let ToServer::V1(data) = self { + fn unwrap_latest(self) -> Result { + if let ToServer::V2(data) = self { Ok(data) } else { bail!("version not latest"); @@ -60,6 +214,7 @@ impl OwnedVersionedData for ToServer { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { 1 => Ok(ToServer::V1(serde_bare::from_slice(payload)?)), + 2 => Ok(ToServer::V2(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } @@ -67,24 +222,170 @@ impl OwnedVersionedData for ToServer { fn serialize_version(self, _version: u16) -> Result> { match self { ToServer::V1(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToServer::V2(data) => serde_bare::to_vec(&data).map_err(Into::into), + } + } + + fn deserialize_converters() -> Vec Result> { + vec![Self::v1_to_v2] + } + + fn serialize_converters() -> Vec Result> { + vec![Self::v2_to_v1] + } +} + +impl ToServer { + fn v1_to_v2(self) -> Result { + match self { + ToServer::V1(x) => { + let inner = match x { + v1::ToServer::ToServerInit(init) => { + v2::ToServer::ToServerInit(v2::ToServerInit { + name: init.name, + version: init.version, + total_slots: init.total_slots, + last_command_idx: init.last_command_idx, + prepopulate_actor_names: init.prepopulate_actor_names.map(|map| { + map.into_iter() + .map(|(k, v)| { + ( + k, + v2::ActorName { + metadata: v.metadata, + }, + ) + }) + .collect() + }), + metadata: init.metadata, + }) + } + v1::ToServer::ToServerEvents(events) => v2::ToServer::ToServerEvents( + events + .into_iter() + .map(|event| v2::EventWrapper { + index: event.index, + inner: convert_event_v1_to_v2(event.inner), + }) + .collect(), + ), + v1::ToServer::ToServerAckCommands(ack) => { + v2::ToServer::ToServerAckCommands(v2::ToServerAckCommands { + last_command_idx: ack.last_command_idx, + }) + } + v1::ToServer::ToServerStopping => v2::ToServer::ToServerStopping, + v1::ToServer::ToServerPing(ping) => { + v2::ToServer::ToServerPing(v2::ToServerPing { ts: ping.ts }) + } + v1::ToServer::ToServerKvRequest(req) => { + v2::ToServer::ToServerKvRequest(v2::ToServerKvRequest { + actor_id: req.actor_id, + request_id: req.request_id, + data: convert_kv_request_data_v1_to_v2(req.data), + }) + } + v1::ToServer::ToServerTunnelMessage(msg) => { + v2::ToServer::ToServerTunnelMessage(v2::ToServerTunnelMessage { + request_id: msg.request_id, + message_id: msg.message_id, + message_kind: convert_to_server_tunnel_message_kind_v1_to_v2( + msg.message_kind, + ), + }) + } + }; + + Ok(ToServer::V2(inner)) + } + value @ ToServer::V2(_) => Ok(value), + } + } + + fn v2_to_v1(self) -> Result { + match self { + ToServer::V1(_) => Ok(self), + ToServer::V2(x) => { + let inner = match x { + v2::ToServer::ToServerInit(init) => { + v1::ToServer::ToServerInit(v1::ToServerInit { + name: init.name, + version: init.version, + total_slots: init.total_slots, + last_command_idx: init.last_command_idx, + prepopulate_actor_names: init.prepopulate_actor_names.map(|map| { + map.into_iter() + .map(|(k, v)| { + ( + k, + v1::ActorName { + metadata: v.metadata, + }, + ) + }) + .collect() + }), + metadata: init.metadata, + }) + } + v2::ToServer::ToServerEvents(events) => v1::ToServer::ToServerEvents( + events + .into_iter() + .map(|event| v1::EventWrapper { + index: event.index, + inner: convert_event_v2_to_v1(event.inner), + }) + .collect(), + ), + v2::ToServer::ToServerAckCommands(ack) => { + v1::ToServer::ToServerAckCommands(v1::ToServerAckCommands { + last_command_idx: ack.last_command_idx, + }) + } + v2::ToServer::ToServerStopping => v1::ToServer::ToServerStopping, + v2::ToServer::ToServerPing(ping) => { + v1::ToServer::ToServerPing(v1::ToServerPing { ts: ping.ts }) + } + v2::ToServer::ToServerKvRequest(req) => { + v1::ToServer::ToServerKvRequest(v1::ToServerKvRequest { + actor_id: req.actor_id, + request_id: req.request_id, + data: convert_kv_request_data_v2_to_v1(req.data), + }) + } + v2::ToServer::ToServerTunnelMessage(msg) => { + v1::ToServer::ToServerTunnelMessage(v1::ToServerTunnelMessage { + request_id: msg.request_id, + message_id: msg.message_id, + message_kind: convert_to_server_tunnel_message_kind_v2_to_v1( + msg.message_kind, + )?, + }) + } + }; + + Ok(ToServer::V1(inner)) + } } } } pub enum ToGateway { - V1(v1::ToGateway), + // No change between v1 and v2 + V2(v2::ToGateway), } impl OwnedVersionedData for ToGateway { - type Latest = v1::ToGateway; + type Latest = v2::ToGateway; - fn latest(latest: v1::ToGateway) -> Self { - ToGateway::V1(latest) + fn wrap_latest(latest: v2::ToGateway) -> Self { + ToGateway::V2(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] - if let ToGateway::V1(data) = self { + if let ToGateway::V2(data) = self { Ok(data) } else { bail!("version not latest"); @@ -93,38 +394,33 @@ impl OwnedVersionedData for ToGateway { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { - 1 => Ok(ToGateway::V1(serde_bare::from_slice(payload)?)), + 1 | 2 => Ok(ToGateway::V2(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } fn serialize_version(self, _version: u16) -> Result> { match self { - ToGateway::V1(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToGateway::V2(data) => serde_bare::to_vec(&data).map_err(Into::into), } } } -impl ToGateway { - pub fn serialize(self) -> Result> { - ::serialize(self, PROTOCOL_VERSION) - } -} - pub enum ToServerlessServer { - V1(v1::ToServerlessServer), + // No change between v1 and v2 + V2(v2::ToServerlessServer), } impl OwnedVersionedData for ToServerlessServer { - type Latest = v1::ToServerlessServer; + type Latest = v2::ToServerlessServer; - fn latest(latest: v1::ToServerlessServer) -> Self { - ToServerlessServer::V1(latest) + fn wrap_latest(latest: v2::ToServerlessServer) -> Self { + ToServerlessServer::V2(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] - if let ToServerlessServer::V1(data) = self { + if let ToServerlessServer::V2(data) = self { Ok(data) } else { bail!("version not latest"); @@ -133,14 +429,459 @@ impl OwnedVersionedData for ToServerlessServer { fn deserialize_version(payload: &[u8], version: u16) -> Result { match version { - 1 => Ok(ToServerlessServer::V1(serde_bare::from_slice(payload)?)), + 1 | 2 => Ok(ToServerlessServer::V2(serde_bare::from_slice(payload)?)), _ => bail!("invalid version: {version}"), } } fn serialize_version(self, _version: u16) -> Result> { match self { - ToServerlessServer::V1(data) => serde_bare::to_vec(&data).map_err(Into::into), + ToServerlessServer::V2(data) => serde_bare::to_vec(&data).map_err(Into::into), + } + } +} + +// Helper conversion functions +fn convert_to_client_tunnel_message_kind_v1_to_v2( + kind: v1::ToClientTunnelMessageKind, +) -> v2::ToClientTunnelMessageKind { + match kind { + v1::ToClientTunnelMessageKind::TunnelAck => v2::ToClientTunnelMessageKind::TunnelAck, + v1::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v2::ToClientTunnelMessageKind::ToClientRequestStart(v2::ToClientRequestStart { + actor_id: req.actor_id, + method: req.method, + path: req.path, + headers: req.headers, + body: req.body, + stream: req.stream, + }) + } + v1::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v2::ToClientTunnelMessageKind::ToClientRequestChunk(v2::ToClientRequestChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v1::ToClientTunnelMessageKind::ToClientRequestAbort => { + v2::ToClientTunnelMessageKind::ToClientRequestAbort + } + v1::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v2::ToClientTunnelMessageKind::ToClientWebSocketOpen(v2::ToClientWebSocketOpen { + actor_id: ws.actor_id, + path: ws.path, + headers: ws.headers, + }) + } + v1::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + v2::ToClientTunnelMessageKind::ToClientWebSocketMessage(v2::ToClientWebSocketMessage { + // Default to 0 for v1 messages (hibernation disabled by default) + index: 0, + data: msg.data, + binary: msg.binary, + }) + } + v1::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v2::ToClientTunnelMessageKind::ToClientWebSocketClose(v2::ToClientWebSocketClose { + code: close.code, + reason: close.reason, + }) + } + } +} + +fn convert_to_client_tunnel_message_kind_v2_to_v1( + kind: v2::ToClientTunnelMessageKind, +) -> Result { + Ok(match kind { + v2::ToClientTunnelMessageKind::TunnelAck => v1::ToClientTunnelMessageKind::TunnelAck, + v2::ToClientTunnelMessageKind::ToClientRequestStart(req) => { + v1::ToClientTunnelMessageKind::ToClientRequestStart(v1::ToClientRequestStart { + actor_id: req.actor_id, + method: req.method, + path: req.path, + headers: req.headers, + body: req.body, + stream: req.stream, + }) + } + v2::ToClientTunnelMessageKind::ToClientRequestChunk(chunk) => { + v1::ToClientTunnelMessageKind::ToClientRequestChunk(v1::ToClientRequestChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v2::ToClientTunnelMessageKind::ToClientRequestAbort => { + v1::ToClientTunnelMessageKind::ToClientRequestAbort + } + v2::ToClientTunnelMessageKind::ToClientWebSocketOpen(ws) => { + v1::ToClientTunnelMessageKind::ToClientWebSocketOpen(v1::ToClientWebSocketOpen { + actor_id: ws.actor_id, + path: ws.path, + headers: ws.headers, + }) + } + v2::ToClientTunnelMessageKind::ToClientWebSocketMessage(msg) => { + v1::ToClientTunnelMessageKind::ToClientWebSocketMessage(v1::ToClientWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v2::ToClientTunnelMessageKind::ToClientWebSocketClose(close) => { + v1::ToClientTunnelMessageKind::ToClientWebSocketClose(v1::ToClientWebSocketClose { + code: close.code, + reason: close.reason, + }) + } + }) +} + +fn convert_to_server_tunnel_message_kind_v1_to_v2( + kind: v1::ToServerTunnelMessageKind, +) -> v2::ToServerTunnelMessageKind { + match kind { + v1::ToServerTunnelMessageKind::TunnelAck => v2::ToServerTunnelMessageKind::TunnelAck, + v1::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { + v2::ToServerTunnelMessageKind::ToServerResponseStart(v2::ToServerResponseStart { + status: resp.status, + headers: resp.headers, + body: resp.body, + stream: resp.stream, + }) + } + v1::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { + v2::ToServerTunnelMessageKind::ToServerResponseChunk(v2::ToServerResponseChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v1::ToServerTunnelMessageKind::ToServerResponseAbort => { + v2::ToServerTunnelMessageKind::ToServerResponseAbort + } + v1::ToServerTunnelMessageKind::ToServerWebSocketOpen => { + v2::ToServerTunnelMessageKind::ToServerWebSocketOpen(v2::ToServerWebSocketOpen { + can_hibernate: false, + last_msg_index: -1, + }) + } + v1::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { + v2::ToServerTunnelMessageKind::ToServerWebSocketMessage(v2::ToServerWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v1::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { + v2::ToServerTunnelMessageKind::ToServerWebSocketClose(v2::ToServerWebSocketClose { + code: close.code, + reason: close.reason, + retry: false, + }) + } + } +} + +fn convert_to_server_tunnel_message_kind_v2_to_v1( + kind: v2::ToServerTunnelMessageKind, +) -> Result { + Ok(match kind { + v2::ToServerTunnelMessageKind::TunnelAck => v1::ToServerTunnelMessageKind::TunnelAck, + v2::ToServerTunnelMessageKind::ToServerResponseStart(resp) => { + v1::ToServerTunnelMessageKind::ToServerResponseStart(v1::ToServerResponseStart { + status: resp.status, + headers: resp.headers, + body: resp.body, + stream: resp.stream, + }) + } + v2::ToServerTunnelMessageKind::ToServerResponseChunk(chunk) => { + v1::ToServerTunnelMessageKind::ToServerResponseChunk(v1::ToServerResponseChunk { + body: chunk.body, + finish: chunk.finish, + }) + } + v2::ToServerTunnelMessageKind::ToServerResponseAbort => { + v1::ToServerTunnelMessageKind::ToServerResponseAbort + } + v2::ToServerTunnelMessageKind::ToServerWebSocketOpen(_) => { + v1::ToServerTunnelMessageKind::ToServerWebSocketOpen + } + v2::ToServerTunnelMessageKind::ToServerWebSocketMessage(msg) => { + v1::ToServerTunnelMessageKind::ToServerWebSocketMessage(v1::ToServerWebSocketMessage { + data: msg.data, + binary: msg.binary, + }) + } + v2::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(_) => { + // v1 doesn't have MessageAck, this is a v2-only feature + bail!("ToServerWebSocketMessageAck is not supported in v1"); } + v2::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { + v1::ToServerTunnelMessageKind::ToServerWebSocketClose(v1::ToServerWebSocketClose { + code: close.code, + reason: close.reason, + }) + } + }) +} + +fn convert_event_v1_to_v2(event: v1::Event) -> v2::Event { + match event { + v1::Event::EventActorIntent(intent) => v2::Event::EventActorIntent(v2::EventActorIntent { + actor_id: intent.actor_id, + generation: intent.generation, + intent: convert_actor_intent_v1_to_v2(intent.intent), + }), + v1::Event::EventActorStateUpdate(state) => { + v2::Event::EventActorStateUpdate(v2::EventActorStateUpdate { + actor_id: state.actor_id, + generation: state.generation, + state: convert_actor_state_v1_to_v2(state.state), + }) + } + v1::Event::EventActorSetAlarm(alarm) => { + v2::Event::EventActorSetAlarm(v2::EventActorSetAlarm { + actor_id: alarm.actor_id, + generation: alarm.generation, + alarm_ts: alarm.alarm_ts, + }) + } + } +} + +fn convert_event_v2_to_v1(event: v2::Event) -> v1::Event { + match event { + v2::Event::EventActorIntent(intent) => v1::Event::EventActorIntent(v1::EventActorIntent { + actor_id: intent.actor_id, + generation: intent.generation, + intent: convert_actor_intent_v2_to_v1(intent.intent), + }), + v2::Event::EventActorStateUpdate(state) => { + v1::Event::EventActorStateUpdate(v1::EventActorStateUpdate { + actor_id: state.actor_id, + generation: state.generation, + state: convert_actor_state_v2_to_v1(state.state), + }) + } + v2::Event::EventActorSetAlarm(alarm) => { + v1::Event::EventActorSetAlarm(v1::EventActorSetAlarm { + actor_id: alarm.actor_id, + generation: alarm.generation, + alarm_ts: alarm.alarm_ts, + }) + } + } +} + +fn convert_actor_intent_v1_to_v2(intent: v1::ActorIntent) -> v2::ActorIntent { + match intent { + v1::ActorIntent::ActorIntentSleep => v2::ActorIntent::ActorIntentSleep, + v1::ActorIntent::ActorIntentStop => v2::ActorIntent::ActorIntentStop, + } +} + +fn convert_actor_intent_v2_to_v1(intent: v2::ActorIntent) -> v1::ActorIntent { + match intent { + v2::ActorIntent::ActorIntentSleep => v1::ActorIntent::ActorIntentSleep, + v2::ActorIntent::ActorIntentStop => v1::ActorIntent::ActorIntentStop, + } +} + +fn convert_actor_state_v1_to_v2(state: v1::ActorState) -> v2::ActorState { + match state { + v1::ActorState::ActorStateRunning => v2::ActorState::ActorStateRunning, + v1::ActorState::ActorStateStopped(stopped) => { + v2::ActorState::ActorStateStopped(v2::ActorStateStopped { + code: convert_stop_code_v1_to_v2(stopped.code), + message: stopped.message, + }) + } + } +} + +fn convert_actor_state_v2_to_v1(state: v2::ActorState) -> v1::ActorState { + match state { + v2::ActorState::ActorStateRunning => v1::ActorState::ActorStateRunning, + v2::ActorState::ActorStateStopped(stopped) => { + v1::ActorState::ActorStateStopped(v1::ActorStateStopped { + code: convert_stop_code_v2_to_v1(stopped.code), + message: stopped.message, + }) + } + } +} + +fn convert_stop_code_v1_to_v2(code: v1::StopCode) -> v2::StopCode { + match code { + v1::StopCode::Ok => v2::StopCode::Ok, + v1::StopCode::Error => v2::StopCode::Error, + } +} + +fn convert_stop_code_v2_to_v1(code: v2::StopCode) -> v1::StopCode { + match code { + v2::StopCode::Ok => v1::StopCode::Ok, + v2::StopCode::Error => v1::StopCode::Error, + } +} + +fn convert_kv_request_data_v1_to_v2(data: v1::KvRequestData) -> v2::KvRequestData { + match data { + v1::KvRequestData::KvGetRequest(req) => { + v2::KvRequestData::KvGetRequest(v2::KvGetRequest { keys: req.keys }) + } + v1::KvRequestData::KvListRequest(req) => { + v2::KvRequestData::KvListRequest(v2::KvListRequest { + query: convert_kv_list_query_v1_to_v2(req.query), + reverse: req.reverse, + limit: req.limit, + }) + } + v1::KvRequestData::KvPutRequest(req) => v2::KvRequestData::KvPutRequest(v2::KvPutRequest { + keys: req.keys, + values: req.values, + }), + v1::KvRequestData::KvDeleteRequest(req) => { + v2::KvRequestData::KvDeleteRequest(v2::KvDeleteRequest { keys: req.keys }) + } + v1::KvRequestData::KvDropRequest => v2::KvRequestData::KvDropRequest, + } +} + +fn convert_kv_request_data_v2_to_v1(data: v2::KvRequestData) -> v1::KvRequestData { + match data { + v2::KvRequestData::KvGetRequest(req) => { + v1::KvRequestData::KvGetRequest(v1::KvGetRequest { keys: req.keys }) + } + v2::KvRequestData::KvListRequest(req) => { + v1::KvRequestData::KvListRequest(v1::KvListRequest { + query: convert_kv_list_query_v2_to_v1(req.query), + reverse: req.reverse, + limit: req.limit, + }) + } + v2::KvRequestData::KvPutRequest(req) => v1::KvRequestData::KvPutRequest(v1::KvPutRequest { + keys: req.keys, + values: req.values, + }), + v2::KvRequestData::KvDeleteRequest(req) => { + v1::KvRequestData::KvDeleteRequest(v1::KvDeleteRequest { keys: req.keys }) + } + v2::KvRequestData::KvDropRequest => v1::KvRequestData::KvDropRequest, + } +} + +fn convert_kv_response_data_v1_to_v2(data: v1::KvResponseData) -> v2::KvResponseData { + match data { + v1::KvResponseData::KvErrorResponse(err) => { + v2::KvResponseData::KvErrorResponse(v2::KvErrorResponse { + message: err.message, + }) + } + v1::KvResponseData::KvGetResponse(resp) => { + v2::KvResponseData::KvGetResponse(v2::KvGetResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v1_to_v2) + .collect(), + }) + } + v1::KvResponseData::KvListResponse(resp) => { + v2::KvResponseData::KvListResponse(v2::KvListResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v1_to_v2) + .collect(), + }) + } + v1::KvResponseData::KvPutResponse => v2::KvResponseData::KvPutResponse, + v1::KvResponseData::KvDeleteResponse => v2::KvResponseData::KvDeleteResponse, + v1::KvResponseData::KvDropResponse => v2::KvResponseData::KvDropResponse, + } +} + +fn convert_kv_response_data_v2_to_v1(data: v2::KvResponseData) -> v1::KvResponseData { + match data { + v2::KvResponseData::KvErrorResponse(err) => { + v1::KvResponseData::KvErrorResponse(v1::KvErrorResponse { + message: err.message, + }) + } + v2::KvResponseData::KvGetResponse(resp) => { + v1::KvResponseData::KvGetResponse(v1::KvGetResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v2_to_v1) + .collect(), + }) + } + v2::KvResponseData::KvListResponse(resp) => { + v1::KvResponseData::KvListResponse(v1::KvListResponse { + keys: resp.keys, + values: resp.values, + metadata: resp + .metadata + .into_iter() + .map(convert_kv_metadata_v2_to_v1) + .collect(), + }) + } + v2::KvResponseData::KvPutResponse => v1::KvResponseData::KvPutResponse, + v2::KvResponseData::KvDeleteResponse => v1::KvResponseData::KvDeleteResponse, + v2::KvResponseData::KvDropResponse => v1::KvResponseData::KvDropResponse, + } +} + +fn convert_kv_list_query_v1_to_v2(query: v1::KvListQuery) -> v2::KvListQuery { + match query { + v1::KvListQuery::KvListAllQuery => v2::KvListQuery::KvListAllQuery, + v1::KvListQuery::KvListRangeQuery(range) => { + v2::KvListQuery::KvListRangeQuery(v2::KvListRangeQuery { + start: range.start, + end: range.end, + exclusive: range.exclusive, + }) + } + v1::KvListQuery::KvListPrefixQuery(prefix) => { + v2::KvListQuery::KvListPrefixQuery(v2::KvListPrefixQuery { key: prefix.key }) + } + } +} + +fn convert_kv_list_query_v2_to_v1(query: v2::KvListQuery) -> v1::KvListQuery { + match query { + v2::KvListQuery::KvListAllQuery => v1::KvListQuery::KvListAllQuery, + v2::KvListQuery::KvListRangeQuery(range) => { + v1::KvListQuery::KvListRangeQuery(v1::KvListRangeQuery { + start: range.start, + end: range.end, + exclusive: range.exclusive, + }) + } + v2::KvListQuery::KvListPrefixQuery(prefix) => { + v1::KvListQuery::KvListPrefixQuery(v1::KvListPrefixQuery { key: prefix.key }) + } + } +} + +fn convert_kv_metadata_v1_to_v2(metadata: v1::KvMetadata) -> v2::KvMetadata { + v2::KvMetadata { + version: metadata.version, + create_ts: metadata.create_ts, + } +} + +fn convert_kv_metadata_v2_to_v1(metadata: v2::KvMetadata) -> v1::KvMetadata { + v1::KvMetadata { + version: metadata.version, + create_ts: metadata.create_ts, } } diff --git a/engine/sdks/rust/ups-protocol/src/versioned.rs b/engine/sdks/rust/ups-protocol/src/versioned.rs index a4cb0fb07e..7f6eeae70c 100644 --- a/engine/sdks/rust/ups-protocol/src/versioned.rs +++ b/engine/sdks/rust/ups-protocol/src/versioned.rs @@ -10,11 +10,11 @@ pub enum UpsMessage { impl OwnedVersionedData for UpsMessage { type Latest = v1::UpsMessage; - fn latest(latest: v1::UpsMessage) -> Self { + fn wrap_latest(latest: v1::UpsMessage) -> Self { UpsMessage::V1(latest) } - fn into_latest(self) -> Result { + fn unwrap_latest(self) -> Result { #[allow(irrefutable_let_patterns)] if let UpsMessage::V1(data) = self { Ok(data) diff --git a/engine/sdks/schemas/epoxy-protocol/v1.bare b/engine/sdks/schemas/epoxy-protocol/v1.bare index 88558c1491..f3a0e04327 100644 --- a/engine/sdks/schemas/epoxy-protocol/v1.bare +++ b/engine/sdks/schemas/epoxy-protocol/v1.bare @@ -213,6 +213,11 @@ type KvGetResponse struct { value: optional } +type KvPurgeRequest struct { + keys: list +} + +type KvPurgeResponse void # MARK: Request/Response type RequestKind union { @@ -225,7 +230,8 @@ type RequestKind union { HealthCheckRequest | CoordinatorUpdateReplicaStatusRequest | BeginLearningRequest | - KvGetRequest + KvGetRequest | + KvPurgeRequest } type Request struct { @@ -244,7 +250,8 @@ type ResponseKind union { HealthCheckResponse | CoordinatorUpdateReplicaStatusResponse | BeginLearningResponse | - KvGetResponse + KvGetResponse | + KvPurgeResponse } type Response struct { diff --git a/engine/sdks/schemas/runner-protocol/v2.bare b/engine/sdks/schemas/runner-protocol/v2.bare new file mode 100644 index 0000000000..5d2e499f44 --- /dev/null +++ b/engine/sdks/schemas/runner-protocol/v2.bare @@ -0,0 +1,403 @@ +# Runner Protocol v1 + +# MARK: Core Primitives + +type Id str +type Json str + +# MARK: KV + +# Basic types +type KvKey data +type KvValue data +type KvMetadata struct { + version: data + createTs: i64 +} + +# Query types +type KvListAllQuery void +type KvListRangeQuery struct { + start: KvKey + end: KvKey + exclusive: bool +} + +type KvListPrefixQuery struct { + key: KvKey +} + +type KvListQuery union { + KvListAllQuery | + KvListRangeQuery | + KvListPrefixQuery +} + +# Request types +type KvGetRequest struct { + keys: list +} + +type KvListRequest struct { + query: KvListQuery + reverse: optional + limit: optional +} + +type KvPutRequest struct { + keys: list + values: list +} + +type KvDeleteRequest struct { + keys: list +} + +type KvDropRequest void + +# Response types +type KvErrorResponse struct { + message: str +} + +type KvGetResponse struct { + keys: list + values: list + metadata: list +} + +type KvListResponse struct { + keys: list + values: list + metadata: list +} + +type KvPutResponse void +type KvDeleteResponse void +type KvDropResponse void + +# Request/Response unions +type KvRequestData union { + KvGetRequest | + KvListRequest | + KvPutRequest | + KvDeleteRequest | + KvDropRequest +} + +type KvResponseData union { + KvErrorResponse | + KvGetResponse | + KvListResponse | + KvPutResponse | + KvDeleteResponse | + KvDropResponse +} + +# MARK: Actor + +# Core +type StopCode enum { + OK + ERROR +} + +type ActorName struct { + metadata: Json +} + +type ActorConfig struct { + name: str + key: optional + createTs: i64 + input: optional +} + +# Intent +type ActorIntentSleep void + +type ActorIntentStop void + +type ActorIntent union { + ActorIntentSleep | + ActorIntentStop +} + +# State +type ActorStateRunning void + +type ActorStateStopped struct { + code: StopCode + message: optional +} + +type ActorState union { + ActorStateRunning | + ActorStateStopped +} + +# MARK: Events +type EventActorIntent struct { + actorId: Id + generation: u32 + intent: ActorIntent +} + +type EventActorStateUpdate struct { + actorId: Id + generation: u32 + state: ActorState +} + +type EventActorSetAlarm struct { + actorId: Id + generation: u32 + alarmTs: optional +} + +type Event union { + EventActorIntent | + EventActorStateUpdate | + EventActorSetAlarm +} + +type EventWrapper struct { + index: i64 + inner: Event +} + +# MARK: Commands +# +type CommandStartActor struct { + actorId: Id + generation: u32 + config: ActorConfig +} + +type CommandStopActor struct { + actorId: Id + generation: u32 +} + +type Command union { + CommandStartActor | + CommandStopActor +} + +type CommandWrapper struct { + index: i64 + inner: Command +} + +# MARK: Tunnel + +type RequestId data[16] # UUIDv4 +type MessageId data[16] # UUIDv4 + + +# Ack +type TunnelAck void + +# HTTP +type ToClientRequestStart struct { + actorId: Id + method: str + path: str + headers: map + body: optional + stream: bool +} + +type ToClientRequestChunk struct { + body: data + finish: bool +} + +type ToClientRequestAbort void + +type ToServerResponseStart struct { + status: u16 + headers: map + body: optional + stream: bool +} + +type ToServerResponseChunk struct { + body: data + finish: bool +} + +type ToServerResponseAbort void + +# WebSocket +type ToClientWebSocketOpen struct { + actorId: Id + path: str + headers: map +} + +type ToClientWebSocketMessage struct { + index: u16 + data: data + binary: bool +} + +type ToClientWebSocketClose struct { + code: optional + reason: optional +} + +type ToServerWebSocketOpen struct { + canHibernate: bool + lastMsgIndex: i64 +} + +type ToServerWebSocketMessage struct { + data: data + binary: bool +} + +type ToServerWebSocketMessageAck struct { + index: u16 +} + +type ToServerWebSocketClose struct { + code: optional + reason: optional + retry: bool +} + +# To Server +type ToServerTunnelMessageKind union { + TunnelAck | + + # HTTP + ToServerResponseStart | + ToServerResponseChunk | + ToServerResponseAbort | + + # WebSocket + ToServerWebSocketOpen | + ToServerWebSocketMessage | + ToServerWebSocketMessageAck | + ToServerWebSocketClose +} + +type ToServerTunnelMessage struct { + requestId: RequestId + messageId: MessageId + messageKind: ToServerTunnelMessageKind +} + +# To Client +type ToClientTunnelMessageKind union { + TunnelAck | + + # HTTP + ToClientRequestStart | + ToClientRequestChunk | + ToClientRequestAbort | + + # WebSocket + ToClientWebSocketOpen | + ToClientWebSocketMessage | + ToClientWebSocketClose +} + +type ToClientTunnelMessage struct { + requestId: RequestId + messageId: MessageId + messageKind: ToClientTunnelMessageKind + + # Subject to send replies to. + # + # Only sent when opening a new request from gateway -> pegboard-runner-ws. + # + # Should be stripped before sending to the runner. + gatewayReplyTo: optional +} + +# MARK: To Server +type ToServerInit struct { + name: str + version: u32 + totalSlots: u32 + lastCommandIdx: optional + prepopulateActorNames: optional> + metadata: optional +} + +type ToServerEvents list + +type ToServerAckCommands struct { + lastCommandIdx: i64 +} + +type ToServerStopping void + +type ToServerPing struct { + ts: i64 +} + +type ToServerKvRequest struct { + actorId: Id + requestId: u32 + data: KvRequestData +} + +type ToServer union { + ToServerInit | + ToServerEvents | + ToServerAckCommands | + ToServerStopping | + ToServerPing | + ToServerKvRequest | + ToServerTunnelMessage +} + +# MARK: To Client +type ProtocolMetadata struct { + runnerLostThreshold: i64 +} + +type ToClientInit struct { + runnerId: Id + lastEventIdx: i64 + metadata: ProtocolMetadata +} + +type ToClientCommands list + +type ToClientAckEvents struct { + lastEventIdx: i64 +} + +type ToClientKvResponse struct { + requestId: u32 + data: KvResponseData +} + +type ToClientClose void + +type ToClient union { + ToClientInit | + ToClientClose | + ToClientCommands | + ToClientAckEvents | + ToClientKvResponse | + ToClientTunnelMessage +} + +# MARK: To Gateway +type ToGateway struct { + message: ToServerTunnelMessage +} + +# MARK: Serverless +type ToServerlessServerInit struct { + runnerId: Id +} + +type ToServerlessServer union { + ToServerlessServerInit +} diff --git a/engine/sdks/typescript/runner-protocol/src/index.ts b/engine/sdks/typescript/runner-protocol/src/index.ts index 798b86a04d..c6405665cb 100644 --- a/engine/sdks/typescript/runner-protocol/src/index.ts +++ b/engine/sdks/typescript/runner-protocol/src/index.ts @@ -1063,18 +1063,21 @@ export function writeToClientWebSocketOpen(bc: bare.ByteCursor, x: ToClientWebSo } export type ToClientWebSocketMessage = { + readonly index: u16 readonly data: ArrayBuffer readonly binary: boolean } export function readToClientWebSocketMessage(bc: bare.ByteCursor): ToClientWebSocketMessage { return { + index: bare.readU16(bc), data: bare.readData(bc), binary: bare.readBool(bc), } } export function writeToClientWebSocketMessage(bc: bare.ByteCursor, x: ToClientWebSocketMessage): void { + bare.writeU16(bc, x.index) bare.writeData(bc, x.data) bare.writeBool(bc, x.binary) } @@ -1107,7 +1110,22 @@ export function writeToClientWebSocketClose(bc: bare.ByteCursor, x: ToClientWebS write5(bc, x.reason) } -export type ToServerWebSocketOpen = null +export type ToServerWebSocketOpen = { + readonly canHibernate: boolean + readonly lastMsgIndex: i64 +} + +export function readToServerWebSocketOpen(bc: bare.ByteCursor): ToServerWebSocketOpen { + return { + canHibernate: bare.readBool(bc), + lastMsgIndex: bare.readI64(bc), + } +} + +export function writeToServerWebSocketOpen(bc: bare.ByteCursor, x: ToServerWebSocketOpen): void { + bare.writeBool(bc, x.canHibernate) + bare.writeI64(bc, x.lastMsgIndex) +} export type ToServerWebSocketMessage = { readonly data: ArrayBuffer @@ -1126,21 +1144,38 @@ export function writeToServerWebSocketMessage(bc: bare.ByteCursor, x: ToServerWe bare.writeBool(bc, x.binary) } +export type ToServerWebSocketMessageAck = { + readonly index: u16 +} + +export function readToServerWebSocketMessageAck(bc: bare.ByteCursor): ToServerWebSocketMessageAck { + return { + index: bare.readU16(bc), + } +} + +export function writeToServerWebSocketMessageAck(bc: bare.ByteCursor, x: ToServerWebSocketMessageAck): void { + bare.writeU16(bc, x.index) +} + export type ToServerWebSocketClose = { readonly code: u16 | null readonly reason: string | null + readonly retry: boolean } export function readToServerWebSocketClose(bc: bare.ByteCursor): ToServerWebSocketClose { return { code: read9(bc), reason: read5(bc), + retry: bare.readBool(bc), } } export function writeToServerWebSocketClose(bc: bare.ByteCursor, x: ToServerWebSocketClose): void { write9(bc, x.code) write5(bc, x.reason) + bare.writeBool(bc, x.retry) } /** @@ -1159,6 +1194,7 @@ export type ToServerTunnelMessageKind = */ | { readonly tag: "ToServerWebSocketOpen"; readonly val: ToServerWebSocketOpen } | { readonly tag: "ToServerWebSocketMessage"; readonly val: ToServerWebSocketMessage } + | { readonly tag: "ToServerWebSocketMessageAck"; readonly val: ToServerWebSocketMessageAck } | { readonly tag: "ToServerWebSocketClose"; readonly val: ToServerWebSocketClose } export function readToServerTunnelMessageKind(bc: bare.ByteCursor): ToServerTunnelMessageKind { @@ -1174,10 +1210,12 @@ export function readToServerTunnelMessageKind(bc: bare.ByteCursor): ToServerTunn case 3: return { tag: "ToServerResponseAbort", val: null } case 4: - return { tag: "ToServerWebSocketOpen", val: null } + return { tag: "ToServerWebSocketOpen", val: readToServerWebSocketOpen(bc) } case 5: return { tag: "ToServerWebSocketMessage", val: readToServerWebSocketMessage(bc) } case 6: + return { tag: "ToServerWebSocketMessageAck", val: readToServerWebSocketMessageAck(bc) } + case 7: return { tag: "ToServerWebSocketClose", val: readToServerWebSocketClose(bc) } default: { bc.offset = offset @@ -1208,6 +1246,7 @@ export function writeToServerTunnelMessageKind(bc: bare.ByteCursor, x: ToServerT } case "ToServerWebSocketOpen": { bare.writeU8(bc, 4) + writeToServerWebSocketOpen(bc, x.val) break } case "ToServerWebSocketMessage": { @@ -1215,8 +1254,13 @@ export function writeToServerTunnelMessageKind(bc: bare.ByteCursor, x: ToServerT writeToServerWebSocketMessage(bc, x.val) break } - case "ToServerWebSocketClose": { + case "ToServerWebSocketMessageAck": { bare.writeU8(bc, 6) + writeToServerWebSocketMessageAck(bc, x.val) + break + } + case "ToServerWebSocketClose": { + bare.writeU8(bc, 7) writeToServerWebSocketClose(bc, x.val) break } diff --git a/engine/sdks/typescript/runner/src/mod.ts b/engine/sdks/typescript/runner/src/mod.ts index 8f9d2ff2c8..4603cc35f7 100644 --- a/engine/sdks/typescript/runner/src/mod.ts +++ b/engine/sdks/typescript/runner/src/mod.ts @@ -8,7 +8,7 @@ import { importWebSocket } from "./websocket.js"; import type { WebSocketTunnelAdapter } from "./websocket-tunnel-adapter"; const KV_EXPIRE: number = 30_000; -const PROTOCOL_VERSION: number = 1; +const PROTOCOL_VERSION: number = 2; /** Warn once the backlog significantly exceeds the server's ack batch size. */ const EVENT_BACKLOG_WARN_THRESHOLD = 10_000; @@ -62,9 +62,15 @@ export interface RunnerConfig { config: ActorConfig, ) => Promise; onActorStop: (actorId: string, generation: number) => Promise; + getActorHibernationConfig: (actorId: string, requestId: ArrayBuffer) => HibernationConfig; noAutoShutdown?: boolean; } +export interface HibernationConfig { + enabled: boolean; + lastMsgIndex: number | undefined; +} + export interface KvListOptions { reverse?: boolean; limit?: number; @@ -155,9 +161,6 @@ export class Runner { const actor = this.#removeActor(actorId, generation); if (!actor) return; - // Unregister actor from tunnel - this.#tunnel?.unregisterActor(actor); - // If onActorStop times out, Pegboard will handle this timeout with ACTOR_STOP_THRESHOLD_DURATION_MS try { await this.#config.onActorStop(actorId, actor.generation); @@ -246,23 +249,8 @@ export class Runner { this.#actors.delete(actorId); - // Close all WebSocket connections for this actor - const actorWebSockets = this.#actorWebSockets.get(actorId); - if (actorWebSockets) { - for (const ws of actorWebSockets) { - try { - ws.close(1000, "Actor stopped"); - } catch (err) { - logger()?.error({ - msg: "error closing websocket for actor", - runnerId: this.runnerId, - actorId, - err, - }); - } - } - this.#actorWebSockets.delete(actorId); - } + // Unregister actor from tunnel + this.#tunnel?.unregisterActor(actor); return actor; } @@ -1390,6 +1378,10 @@ export class Runner { } } + sendWebsocketMessageAck(requestId: ArrayBuffer, index: number) { + this.#tunnel?.__ackWebsocketMessage(requestId, index); + } + getServerlessInitPacket(): string | undefined { if (!this.runnerId) return undefined; diff --git a/engine/sdks/typescript/runner/src/tunnel.ts b/engine/sdks/typescript/runner/src/tunnel.ts index 3e9dfc24e2..9882341bc1 100644 --- a/engine/sdks/typescript/runner/src/tunnel.ts +++ b/engine/sdks/typescript/runner/src/tunnel.ts @@ -1,6 +1,6 @@ import type * as protocol from "@rivetkit/engine-runner-protocol"; import type { MessageId, RequestId } from "@rivetkit/engine-runner-protocol"; -import { v4 as uuidv4 } from "uuid"; +import { v4 as uuidv4, stringify as uuidstringify } from "uuid"; import { logger } from "./log"; import type { ActorInstance, Runner } from "./mod"; import { unreachable } from "./utils"; @@ -8,6 +8,7 @@ import { WebSocketTunnelAdapter } from "./websocket-tunnel-adapter"; const GC_INTERVAL = 60000; // 60 seconds const MESSAGE_ACK_TIMEOUT = 5000; // 5 seconds +const WEBSOCKET_STATE_PERSIST_TIMEOUT = 30000; // 30 seconds interface PendingRequest { resolve: (response: Response) => void; @@ -56,7 +57,7 @@ export class Tunnel { // Close all WebSockets for (const [_, ws] of this.#actorWebSockets) { - ws.close(); + ws.__closeWithRetry(); } this.#actorWebSockets.clear(); } @@ -108,6 +109,12 @@ export class Tunnel { }, }; + logger()?.debug({ + msg: "ack tunnel msg", + requestId: uuidstringify(new Uint8Array(requestId)), + messageId: uuidstringify(new Uint8Array(messageId)), + }); + this.#runner.__sendToServer(message); } @@ -156,7 +163,7 @@ export class Tunnel { const webSocket = this.#actorWebSockets.get(requestIdStr); if (webSocket) { // Close the WebSocket connection - webSocket.close(1000, "Message acknowledgment timeout"); + webSocket.__closeWithRetry(1000, "Message acknowledgment timeout"); // Clean up from actorWebSockets map this.#actorWebSockets.delete(requestIdStr); @@ -189,11 +196,11 @@ export class Tunnel { } actor.requests.clear(); - // Close all WebSockets for this actor + // Flush acks and close all WebSockets for this actor for (const webSocketId of actor.webSockets) { const ws = this.#actorWebSockets.get(webSocketId); if (ws) { - ws.close(1000, "Actor stopped"); + ws.__closeWithRetry(1000, "Actor stopped"); this.#actorWebSockets.delete(webSocketId); } } @@ -224,6 +231,13 @@ export class Tunnel { } async handleTunnelMessage(message: protocol.ToClientTunnelMessage) { + logger()?.debug({ + msg: "tunnel msg", + requestId: uuidstringify(new Uint8Array(message.requestId)), + messageId: uuidstringify(new Uint8Array(message.messageId)), + message: message.messageKind, + }); + if (message.messageKind.tag === "TunnelAck") { // Mark pending message as acknowledged and remove it const msgIdStr = bufferToString(message.messageId); @@ -232,36 +246,47 @@ export class Tunnel { this.#pendingTunnelMessages.delete(msgIdStr); } } else { - this.#sendAck(message.requestId, message.messageId); switch (message.messageKind.tag) { case "ToClientRequestStart": + this.#sendAck(message.requestId, message.messageId); + await this.#handleRequestStart( message.requestId, message.messageKind.val, ); break; case "ToClientRequestChunk": + this.#sendAck(message.requestId, message.messageId); + await this.#handleRequestChunk( message.requestId, message.messageKind.val, ); break; case "ToClientRequestAbort": + this.#sendAck(message.requestId, message.messageId); + await this.#handleRequestAbort(message.requestId); break; case "ToClientWebSocketOpen": + this.#sendAck(message.requestId, message.messageId); + await this.#handleWebSocketOpen( message.requestId, message.messageKind.val, ); break; case "ToClientWebSocketMessage": - await this.#handleWebSocketMessage( + this.#sendAck(message.requestId, message.messageId); + + let _unhandled = await this.#handleWebSocketMessage( message.requestId, message.messageKind.val, ); break; case "ToClientWebSocketClose": + this.#sendAck(message.requestId, message.messageId); + await this.#handleWebSocketClose( message.requestId, message.messageKind.val, @@ -311,8 +336,8 @@ export class Tunnel { existing.actorId = req.actorId; } else { this.#actorPendingRequests.set(requestIdStr, { - resolve: () => {}, - reject: () => {}, + resolve: () => { }, + reject: () => { }, streamController: controller, actorId: req.actorId, }); @@ -443,6 +468,7 @@ export class Tunnel { val: { code: 1011, reason: "Actor not found", + retry: false, }, }); return; @@ -460,6 +486,7 @@ export class Tunnel { val: { code: 1011, reason: "Not Implemented", + retry: false, }, }); return; @@ -479,7 +506,7 @@ export class Tunnel { const dataBuffer = typeof data === "string" ? (new TextEncoder().encode(data) - .buffer as ArrayBuffer) + .buffer as ArrayBuffer) : data; this.#sendMessage(requestId, { @@ -490,13 +517,14 @@ export class Tunnel { }, }); }, - (code?: number, reason?: string) => { + (code?: number, reason?: string, retry: boolean = false) => { // Send close through tunnel this.#sendMessage(requestId, { tag: "ToServerWebSocketClose", val: { code: code || null, reason: reason || null, + retry, }, }); @@ -514,13 +542,17 @@ export class Tunnel { this.#actorWebSockets.set(webSocketId, adapter); // Send open confirmation + let hibernationConfig = this.#runner.config.getActorHibernationConfig(actor.actorId, requestId); this.#sendMessage(requestId, { tag: "ToServerWebSocketOpen", - val: null, + val: { + canHibernate: hibernationConfig.enabled, + lastMsgIndex: BigInt(hibernationConfig.lastMsgIndex ?? -1), + }, }); // Notify adapter that connection is open - adapter._handleOpen(); + adapter._handleOpen(requestId); // Create a minimal request object for the websocket handler // Include original headers from the open message @@ -557,6 +589,7 @@ export class Tunnel { val: { code: 1011, reason: "Server Error", + retry: false, }, }); @@ -569,10 +602,11 @@ export class Tunnel { } } + /// Returns false if the message was sent off async #handleWebSocketMessage( requestId: ArrayBuffer, - msg: protocol.ToServerWebSocketMessage, - ) { + msg: protocol.ToClientWebSocketMessage, + ): Promise { const webSocketId = bufferToString(requestId); const adapter = this.#actorWebSockets.get(webSocketId); if (adapter) { @@ -580,18 +614,39 @@ export class Tunnel { ? new Uint8Array(msg.data) : new TextDecoder().decode(new Uint8Array(msg.data)); - adapter._handleMessage(data, msg.binary); + return adapter._handleMessage(requestId, data, msg.index, msg.binary); + } else { + return true; } } + __ackWebsocketMessage(requestId: ArrayBuffer, index: number) { + logger()?.debug({ + msg: "ack ws msg", + requestId: uuidstringify(new Uint8Array(requestId)), + index, + }); + + if (index < 0 || index > 65535) throw new Error("invalid websocket ack index"); + + // Send the ack message + this.#sendMessage(requestId, { + tag: "ToServerWebSocketMessageAck", + val: { + index, + }, + }); + } + async #handleWebSocketClose( requestId: ArrayBuffer, - close: protocol.ToServerWebSocketClose, + close: protocol.ToClientWebSocketClose, ) { const webSocketId = bufferToString(requestId); const adapter = this.#actorWebSockets.get(webSocketId); if (adapter) { adapter._handleClose( + requestId, close.code || undefined, close.reason || undefined, ); diff --git a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts index eb46758d94..2fd46085b2 100644 --- a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts +++ b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts @@ -17,7 +17,7 @@ export class WebSocketTunnelAdapter { #protocol = ""; #url = ""; #sendCallback: (data: ArrayBuffer | string, isBinary: boolean) => void; - #closeCallback: (code?: number, reason?: string) => void; + #closeCallback: (code?: number, reason?: string, retry?: boolean) => void; // Event buffering for events fired before listeners are attached #bufferedEvents: Array<{ @@ -28,7 +28,7 @@ export class WebSocketTunnelAdapter { constructor( webSocketId: string, sendCallback: (data: ArrayBuffer | string, isBinary: boolean) => void, - closeCallback: (code?: number, reason?: string) => void, + closeCallback: (code?: number, reason?: string, retry?: boolean) => void, ) { this.#webSocketId = webSocketId; this.#sendCallback = sendCallback; @@ -186,6 +186,14 @@ export class WebSocketTunnelAdapter { } close(code?: number, reason?: string): void { + this.closeInner(code, reason); + } + + __closeWithRetry(code?: number, reason?: string): void { + this.closeInner(code, reason, true); + } + + closeInner(code?: number, reason?: string, retry: boolean = false): void { if ( this.#readyState === 2 || // CLOSING this.#readyState === 3 // CLOSED @@ -196,7 +204,7 @@ export class WebSocketTunnelAdapter { this.#readyState = 2; // CLOSING // Send close through tunnel - this.#closeCallback(code, reason); + this.#closeCallback(code, reason, retry); // Update state and fire event this.#readyState = 3; // CLOSED @@ -410,7 +418,7 @@ export class WebSocketTunnelAdapter { } // Internal methods called by the Tunnel class - _handleOpen(): void { + _handleOpen(requestId: ArrayBuffer): void { if (this.#readyState !== 0) { // CONNECTING return; @@ -420,16 +428,18 @@ export class WebSocketTunnelAdapter { const event = { type: "open", + rivetRequestId: requestId, target: this, }; this.#fireEvent("open", event); } - _handleMessage(data: string | Uint8Array, isBinary: boolean): void { + /// Returns false if the message was sent off. + _handleMessage(requestId: ArrayBuffer, data: string | Uint8Array, index: number, isBinary: boolean): boolean { if (this.#readyState !== 1) { // OPEN - return; + return true; } let messageData: any; @@ -460,15 +470,19 @@ export class WebSocketTunnelAdapter { } const event = { - data: messageData, type: "message", + data: messageData, + rivetRequestId: requestId, + rivetMessageIndex: index, target: this, }; this.#fireEvent("message", event); + + return false; } - _handleClose(code?: number, reason?: string): void { + _handleClose(requestId: ArrayBuffer, code?: number, reason?: string): void { if (this.#readyState === 3) { // CLOSED return; @@ -477,10 +491,11 @@ export class WebSocketTunnelAdapter { this.#readyState = 3; // CLOSED const event = { + type: "close", wasClean: true, code: code || 1000, reason: reason || "", - type: "close", + rivetRequestId: requestId, target: this, }; diff --git a/engine/sdks/typescript/test-runner/src/index.ts b/engine/sdks/typescript/test-runner/src/index.ts index 50aa20c899..1d9815abc3 100644 --- a/engine/sdks/typescript/test-runner/src/index.ts +++ b/engine/sdks/typescript/test-runner/src/index.ts @@ -13,7 +13,7 @@ const INTERNAL_SERVER_PORT = process.env.INTERNAL_SERVER_PORT const RIVET_NAMESPACE = process.env.RIVET_NAMESPACE ?? "default"; const RIVET_RUNNER_NAME = process.env.RIVET_RUNNER_NAME ?? "test-runner"; const RIVET_RUNNER_KEY = - process.env.RIVET_RUNNER_KEY ?? `key-${Math.floor(Math.random() * 10000)}`; + process.env.RIVET_RUNNER_KEY; const RIVET_RUNNER_VERSION = process.env.RIVET_RUNNER_VERSION ? Number(process.env.RIVET_RUNNER_VERSION) : 1; @@ -28,7 +28,7 @@ const AUTOSTART_RUNNER = process.env.NO_AUTOSTART_RUNNER === undefined; let runnerStarted = Promise.withResolvers(); let runnerStopped = Promise.withResolvers(); let runner: Runner | null = null; -const actorWebSockets = new Map(); +const websocketLastMsgIndexes: Map = new Map(); // Create internal server const app = new Hono(); @@ -94,8 +94,6 @@ app.get("/start", async (c) => { }); }); -await autoConfigureServerless(); - if (AUTOSTART_SERVER) { serve({ fetch: app.fetch, @@ -106,8 +104,10 @@ if (AUTOSTART_SERVER) { ); } -if (AUTOSTART_RUNNER) +if (AUTOSTART_RUNNER) { [runner, runnerStarted, runnerStopped] = await startRunner(); +} +else await autoConfigureServerless(); async function autoConfigureServerless() { const res = await fetch( @@ -155,13 +155,13 @@ async function startRunner(): Promise< token: RIVET_TOKEN, namespace: RIVET_NAMESPACE, runnerName: RIVET_RUNNER_NAME, - runnerKey: RIVET_RUNNER_KEY, + runnerKey: RIVET_RUNNER_KEY ?? `key-${Math.floor(Math.random() * 10000)}`, totalSlots: RIVET_RUNNER_TOTAL_SLOTS, prepopulateActorNames: {}, onConnected: () => { runnerStarted.resolve(undefined); }, - onDisconnected: () => {}, + onDisconnected: () => { }, onShutdown: () => { runnerStopped.resolve(undefined); }, @@ -208,13 +208,12 @@ async function startRunner(): Promise< ); }, websocket: async ( - _runner: Runner, + runner: Runner, actorId: string, ws: WebSocket, request: Request, ) => { getLogger().info(`WebSocket connected for actor ${actorId}`); - actorWebSockets.set(actorId, ws); // Echo server - send back any messages received ws.addEventListener("message", (event) => { @@ -222,13 +221,19 @@ async function startRunner(): Promise< getLogger().info({ msg: `WebSocket message from actor ${actorId}`, data, + index: (event as any).rivetMessageIndex, }); + ws.send(`Echo: ${data}`); + + // Ack + const websocketId = Buffer.from((event as any).rivetRequestId).toString("base64"); + websocketLastMsgIndexes.set(websocketId, (event as any).rivetMessageIndex); + runner.sendWebsocketMessageAck((event as any).rivetRequestId, (event as any).rivetMessageIndex); }); ws.addEventListener("close", () => { getLogger().info(`WebSocket closed for actor ${actorId}`); - actorWebSockets.delete(actorId); }); ws.addEventListener("error", (error) => { @@ -238,6 +243,13 @@ async function startRunner(): Promise< }); }); }, + getActorHibernationConfig(actorId, requestId) { + const websocketId = Buffer.from(requestId).toString("base64"); + return { + enabled: true, + lastMsgIndex: websocketLastMsgIndexes.get(websocketId), + }; + }, }; const runner = new Runner(config); diff --git a/scripts/tests/actor_sleep.ts b/scripts/tests/actor_sleep.ts index e39765b47a..8c34a68626 100755 --- a/scripts/tests/actor_sleep.ts +++ b/scripts/tests/actor_sleep.ts @@ -13,10 +13,12 @@ async function main() { // Create an actor console.log("Creating actor..."); - const actorResponse = await getOrCreateActor(RIVET_NAMESPACE, "test-runner", "key"); + const actorResponse = await getOrCreateActor(RIVET_NAMESPACE, "test-runner", "key3"); console.log("Actor created:", actorResponse.actor); for (let i = 0; i < 10; i++) { + await testWebSocket(actorResponse.actor.actor_id); + console.log("Sleeping actor..."); const actorSleepResponse = await fetch(`${RIVET_ENDPOINT}/sleep`, { method: "GET", @@ -38,7 +40,6 @@ async function main() { // await new Promise(resolve => setTimeout(resolve, 2000)); } - // Make a request to the actor console.log("Making request to actor..."); const actorPingResponse = await fetch(`${RIVET_ENDPOINT}/ping`, { @@ -59,8 +60,6 @@ async function main() { } console.log("Actor ping response:", pingResult); - - // await testWebSocket(actorResponse.actor.actor_id); } catch (error) { console.error(`Actor test failed:`, error); } @@ -89,14 +88,6 @@ function testWebSocket(actorId: string): Promise { let pingReceived = false; let echoReceived = false; - const timeout = setTimeout(() => { - console.log( - "No response received within timeout, but connection was established", - ); - // Connection was established, that's enough for the test - ws.close(); - resolve(); - }, 2000); ws.addEventListener("open", () => { console.log("WebSocket connected"); @@ -126,21 +117,18 @@ function testWebSocket(actorId: string): Promise { console.log("Echo test successful!"); // All tests passed - clearTimeout(timeout); ws.close(); resolve(); } }); - ws.addEventListener("error", (error) => { - clearTimeout(timeout); - reject(new Error(`WebSocket error: ${error.message}`)); + ws.addEventListener("error", (event) => { + reject(new Error(`WebSocket error: ${event}`)); }); - ws.addEventListener("close", () => { - clearTimeout(timeout); + ws.addEventListener("close", event => { if (!pingReceived || !echoReceived) { - reject(new Error("WebSocket closed before completing tests")); + reject(new Error(`WebSocket closed before completing tests: ${event.code} (${event.reason}) ${new Date().toISOString()}`)); } }); });