diff --git a/go.mod b/go.mod index 667f3d16a..239ed5153 100644 --- a/go.mod +++ b/go.mod @@ -59,6 +59,7 @@ require ( github.com/ryanuber/go-glob v1.0.0 // indirect github.com/woodsbury/decimal128 v1.4.0 // indirect github.com/zalando/go-keyring v0.2.6 // indirect + github.com/zsais/go-gin-prometheus v1.0.2 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect golang.org/x/oauth2 v0.30.0 // indirect diff --git a/go.sum b/go.sum index b3c7e1917..fb97c168b 100644 --- a/go.sum +++ b/go.sum @@ -269,6 +269,8 @@ github.com/woodsbury/decimal128 v1.4.0 h1:xJATj7lLu4f2oObouMt2tgGiElE5gO6mSWUjQs github.com/woodsbury/decimal128 v1.4.0/go.mod h1:BP46FUrVjVhdTbKT+XuQh2xfQaGki9LMIRJSFuh6THU= github.com/zalando/go-keyring v0.2.6 h1:r7Yc3+H+Ux0+M72zacZoItR3UDxeWfKTcabvkI8ua9s= github.com/zalando/go-keyring v0.2.6/go.mod h1:2TCrxYrbUNYfNS/Kgy/LSrkSQzZ5UPVH85RwfczwvcI= +github.com/zsais/go-gin-prometheus v1.0.2 h1:3asLqrFltMdItpgr/OS4hYc8pLq3HzMa5T1gYuXBIZ0= +github.com/zsais/go-gin-prometheus v1.0.2/go.mod h1:iKBYSOHzvGfe2FyGSOC8JSwUA0MITdnYzI6v+aAbw1Q= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= diff --git a/internal/controller/httpapi/router.go b/internal/controller/httpapi/router.go index e2172b121..3138f28b4 100644 --- a/internal/controller/httpapi/router.go +++ b/internal/controller/httpapi/router.go @@ -6,6 +6,7 @@ import ( "github.com/gin-gonic/gin" "github.com/prometheus/client_golang/prometheus/promhttp" + ginprometheus "github.com/zsais/go-gin-prometheus" "github.com/device-management-toolkit/console/config" v1 "github.com/device-management-toolkit/console/internal/controller/httpapi/v1" @@ -21,6 +22,13 @@ func NewRouter(handler *gin.Engine, l logger.Interface, t usecase.Usecases, cfg handler.Use(gin.Logger()) handler.Use(gin.Recovery()) + // Add Prometheus middleware for automatic HTTP metrics + // Don't automatically register /metrics endpoint - we have our own + p := ginprometheus.NewPrometheus("gin") + p.MetricsPath = "" + // Use middleware function directly without calling Use() which would register conflicting routes + handler.Use(p.HandlerFunc()) + // Initialize Fuego adapter fuegoAdapter := openapi.NewFuegoAdapter(t, l) fuegoAdapter.RegisterRoutes() diff --git a/internal/controller/ws/v1/redirect.go b/internal/controller/ws/v1/redirect.go index d917bb9b2..4063e7e49 100644 --- a/internal/controller/ws/v1/redirect.go +++ b/internal/controller/ws/v1/redirect.go @@ -3,6 +3,7 @@ package v1 import ( "compress/flate" "net/http" + "time" "github.com/gin-gonic/gin" "github.com/golang-jwt/jwt/v5" @@ -59,7 +60,13 @@ func (r *RedirectRoutes) websocketHandler(c *gin.Context) { upgrader.Subprotocols = []string{tokenString} } + // KVM_TIMING: Measure WebSocket upgrade duration + upgradeStart := time.Now() conn, err := r.u.Upgrade(c.Writer, c.Request, nil) + upgradeDuration := time.Since(upgradeStart) + devices.RecordWebsocketUpgrade(upgradeDuration) + r.l.Debug("KVM_TIMING: WebSocket upgrade", "duration_ms", upgradeDuration.Milliseconds()) + if err != nil { http.Error(c.Writer, "Could not open websocket connection", http.StatusInternalServerError) @@ -77,7 +84,13 @@ func (r *RedirectRoutes) websocketHandler(c *gin.Context) { r.l.Info("Websocket connection opened") + // KVM_TIMING: Measure total connection time + totalStart := time.Now() err = r.d.Redirect(c, conn, c.Query("host"), c.Query("mode")) + totalDuration := time.Since(totalStart) + devices.RecordTotalConnection(totalDuration, c.Query("mode")) + r.l.Debug("KVM_TIMING: Total connection time", "duration_ms", totalDuration.Milliseconds(), "mode", c.Query("mode")) + if err != nil { r.l.Error(err, "http - devices - v1 - redirect") errorResponse(c, http.StatusInternalServerError, "redirect failed") diff --git a/internal/controller/ws/v1/redirect_test.go b/internal/controller/ws/v1/redirect_test.go index 5a1db8239..e942a0080 100644 --- a/internal/controller/ws/v1/redirect_test.go +++ b/internal/controller/ws/v1/redirect_test.go @@ -65,21 +65,26 @@ func TestWebSocketHandler(t *testing.T) { //nolint:paralleltest // logging libra Upgrade(gomock.Any(), gomock.Any(), nil). Return(nil, tc.upgraderError) mockLogger.EXPECT().Debug("failed to cast Upgrader to *websocket.Upgrader") + mockLogger.EXPECT().Debug("KVM_TIMING: WebSocket upgrade", "duration_ms", gomock.Any()) } else { mockUpgrader.EXPECT(). Upgrade(gomock.Any(), gomock.Any(), nil). Return(&websocket.Conn{}, nil) mockLogger.EXPECT().Debug("failed to cast Upgrader to *websocket.Upgrader") + mockLogger.EXPECT().Debug("KVM_TIMING: WebSocket upgrade", "duration_ms", gomock.Any()) mockLogger.EXPECT().Info("Websocket connection opened") - if tc.redirectError != nil { - mockLogger.EXPECT().Error(tc.redirectError, "http - devices - v1 - redirect") - } - mockFeature.EXPECT(). Redirect(gomock.Any(), gomock.Any(), "someHost", "someMode"). Return(tc.redirectError) + + // Total connection time is always logged after Redirect completes + mockLogger.EXPECT().Debug("KVM_TIMING: Total connection time", "duration_ms", gomock.Any(), "mode", "someMode") + + if tc.redirectError != nil { + mockLogger.EXPECT().Error(tc.redirectError, "http - devices - v1 - redirect") + } } r := gin.Default() diff --git a/internal/usecase/devices/interceptor.go b/internal/usecase/devices/interceptor.go index 1fe06c9a0..2a4e31289 100644 --- a/internal/usecase/devices/interceptor.go +++ b/internal/usecase/devices/interceptor.go @@ -30,6 +30,8 @@ const ( ConnectionTimeout = 5 * time.Minute InactivityTimeout = 30 * time.Second // Close connection if no data for 30 seconds HeartbeatInterval = 30 * time.Second // Check connection health every 30 seconds + SlowReceiveThreshold = 100 // Milliseconds threshold for slow device receive + SlowWriteThreshold = 50 // Milliseconds threshold for slow write operations ) type DeviceConnection struct { @@ -48,7 +50,13 @@ type DeviceConnection struct { } func (uc *UseCase) Redirect(c context.Context, conn *websocket.Conn, guid, mode string) error { + // KVM_TIMING: Measure device lookup latency + lookupStart := time.Now() device, err := uc.repo.GetByID(c, guid, "") + + RecordDeviceLookup(time.Since(lookupStart)) + uc.log.Debug("KVM_TIMING: Device lookup", "duration_ms", time.Since(lookupStart).Milliseconds(), "guid", guid) + if err != nil { return err } @@ -64,7 +72,13 @@ func (uc *UseCase) Redirect(c context.Context, conn *websocket.Conn, guid, mode return err } + // KVM_TIMING: Measure connection setup latency + connectStart := time.Now() err = uc.redirection.RedirectConnect(c, deviceConnection) + + RecordConnectionSetup(time.Since(connectStart), mode) + uc.log.Debug("KVM_TIMING: Connection setup", "duration_ms", time.Since(connectStart).Milliseconds(), "mode", mode, "guid", guid) + if err != nil { deviceConnection.cancel() @@ -212,7 +226,12 @@ func (uc *UseCase) ListenToDevice(deviceConnection *DeviceConnection) { // Measure time blocked waiting for device data recvStart := time.Now() data, err := uc.redirection.RedirectListen(deviceConnection.ctx, deviceConnection) - kvmDeviceReceiveBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(recvStart).Seconds()) + recvDuration := time.Since(recvStart) + kvmDeviceReceiveBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(recvDuration.Seconds()) + + if recvDuration.Milliseconds() > SlowReceiveThreshold { + uc.log.Debug("KVM_TIMING: Device receive blocked", "duration_ms", recvDuration.Milliseconds(), "mode", deviceConnection.Mode) + } if err != nil { break @@ -241,7 +260,12 @@ func (uc *UseCase) ListenToDevice(deviceConnection *DeviceConnection) { err = conn.WriteMessage(websocket.BinaryMessage, toSend) - kvmDeviceToBrowserWriteSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(start).Seconds()) + writeDuration := time.Since(start) + kvmDeviceToBrowserWriteSeconds.WithLabelValues(deviceConnection.Mode).Observe(writeDuration.Seconds()) + + if writeDuration.Milliseconds() > SlowWriteThreshold { + uc.log.Debug("KVM_TIMING: Device to browser write slow", "duration_ms", writeDuration.Milliseconds(), "mode", deviceConnection.Mode, "bytes", len(toSend)) + } if err != nil { if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { @@ -273,7 +297,12 @@ func (uc *UseCase) ListenToBrowser(deviceConnection *DeviceConnection) { readStart := time.Now() _, msg, err := deviceConnection.Conn.ReadMessage() - kvmBrowserReadBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(readStart).Seconds()) + readDuration := time.Since(readStart) + kvmBrowserReadBlockSeconds.WithLabelValues(deviceConnection.Mode).Observe(readDuration.Seconds()) + + if readDuration.Milliseconds() > SlowReceiveThreshold { + uc.log.Debug("KVM_TIMING: Browser read blocked", "duration_ms", readDuration.Milliseconds(), "mode", deviceConnection.Mode) + } if err != nil { if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { @@ -300,7 +329,12 @@ func (uc *UseCase) ListenToBrowser(deviceConnection *DeviceConnection) { kvmBrowserToDeviceMessages.WithLabelValues(deviceConnection.Mode).Inc() // Send the message to the TCP Connection on the device err = uc.redirection.RedirectSend(deviceConnection.ctx, deviceConnection, toSend) - kvmBrowserToDeviceSendSeconds.WithLabelValues(deviceConnection.Mode).Observe(time.Since(start).Seconds()) + sendDuration := time.Since(start) + kvmBrowserToDeviceSendSeconds.WithLabelValues(deviceConnection.Mode).Observe(sendDuration.Seconds()) + + if sendDuration.Milliseconds() > SlowWriteThreshold { + uc.log.Debug("KVM_TIMING: Browser to device send slow", "duration_ms", sendDuration.Milliseconds(), "mode", deviceConnection.Mode, "bytes", len(toSend)) + } if err != nil { _ = fmt.Errorf("interceptor - listenToBrowser - error sending message to device: %w", err) diff --git a/internal/usecase/devices/metrics.go b/internal/usecase/devices/metrics.go index 849306626..b350ecd2d 100644 --- a/internal/usecase/devices/metrics.go +++ b/internal/usecase/devices/metrics.go @@ -1,6 +1,8 @@ package devices import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) @@ -92,4 +94,73 @@ var ( }, []string{"mode"}, ) + + // KVM Connection Performance Metrics. + kvmDeviceLookupSeconds = promauto.NewHistogram( + prometheus.HistogramOpts{ + Name: "kvm_device_lookup_seconds", + Help: "Time to look up device from database during KVM connection (KVM_TIMING)", + Buckets: []float64{0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, + }, + ) + + kvmConnectionSetupSeconds = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "kvm_connection_setup_seconds", + Help: "Time to establish TCP connection to device during KVM setup (KVM_TIMING)", + Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10}, + }, + []string{"mode"}, + ) + + kvmWebsocketUpgradeSeconds = promauto.NewHistogram( + prometheus.HistogramOpts{ + Name: "kvm_websocket_upgrade_seconds", + Help: "Time to upgrade HTTP connection to WebSocket for KVM (KVM_TIMING)", + Buckets: []float64{0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1}, + }, + ) + + kvmTotalConnectionSeconds = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "kvm_total_connection_seconds", + Help: "Total time from request to ready KVM connection (KVM_TIMING)", + Buckets: []float64{0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 30}, + }, + []string{"mode"}, + ) + + kvmConsentCodeWaitSeconds = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "kvm_consent_code_wait_seconds", + Help: "Time spent waiting for consent code handling during KVM setup (KVM_TIMING)", + Buckets: []float64{0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 30, 60}, + }, + []string{"mode"}, + ) ) + +// RecordWebsocketUpgrade records the WebSocket upgrade duration metric. +func RecordWebsocketUpgrade(duration time.Duration) { + kvmWebsocketUpgradeSeconds.Observe(duration.Seconds()) +} + +// RecordTotalConnection records the total KVM connection time metric. +func RecordTotalConnection(duration time.Duration, mode string) { + kvmTotalConnectionSeconds.WithLabelValues(mode).Observe(duration.Seconds()) +} + +// RecordConsentCodeWait records the consent code wait time metric. +func RecordConsentCodeWait(duration time.Duration, mode string) { + kvmConsentCodeWaitSeconds.WithLabelValues(mode).Observe(duration.Seconds()) +} + +// RecordDeviceLookup records the device lookup duration metric. +func RecordDeviceLookup(duration time.Duration) { + kvmDeviceLookupSeconds.Observe(duration.Seconds()) +} + +// RecordConnectionSetup records the TCP connection setup duration metric. +func RecordConnectionSetup(duration time.Duration, mode string) { + kvmConnectionSetupSeconds.WithLabelValues(mode).Observe(duration.Seconds()) +}