Skip to content

Commit

Permalink
[management] Refactor HTTP metrics (#2476)
Browse files Browse the repository at this point in the history
* Add logging for slow SQL queries in SaveAccount and GetAccount

* Add resource count log for large accounts

* Refactor metrics middleware to simplify counters and histograms

* Update log levels and remove redundant resource count check
  • Loading branch information
bcmmbaga authored Aug 23, 2024
1 parent 33b264e commit d97b036
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 99 deletions.
6 changes: 6 additions & 0 deletions management/server/account.go
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,12 @@ func (a *Account) GetPeerNetworkMap(
objectCount := int64(len(peersToConnect) + len(expiredPeers) + len(routesUpdate) + len(firewallRules))
metrics.CountNetworkMapObjects(objectCount)
metrics.CountGetPeerNetworkMapDuration(time.Since(start))

if objectCount > 5000 {
log.WithContext(ctx).Tracef("account: %s has a total resource count of %d objects, "+
"peers to connect: %d, expired peers: %d, routes: %d, firewall rules: %d",
a.Id, objectCount, len(peersToConnect), len(expiredPeers), len(routesUpdate), len(firewallRules))
}
}

return nm
Expand Down
21 changes: 0 additions & 21 deletions management/server/http/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,27 +100,6 @@ func APIHandler(ctx context.Context, accountManager s.AccountManager, LocationMa
api.addPostureCheckEndpoint()
api.addLocationsEndpoint()

err := api.Router.Walk(func(route *mux.Route, _ *mux.Router, _ []*mux.Route) error {
methods, err := route.GetMethods()
if err != nil { // we may have wildcard routes from integrations without methods, skip them for now
methods = []string{}
}
for _, method := range methods {
template, err := route.GetPathTemplate()
if err != nil {
return err
}
err = metricsMiddleware.AddHTTPRequestResponseCounter(template, method)
if err != nil {
return err
}
}
return nil
})
if err != nil {
return nil, err
}

return rootRouter, nil
}

Expand Down
13 changes: 13 additions & 0 deletions management/server/sql_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ func (s *SqlStore) AcquireReadLockByUID(ctx context.Context, uniqueID string) (u

func (s *SqlStore) SaveAccount(ctx context.Context, account *Account) error {
start := time.Now()
defer func() {
elapsed := time.Since(start)
if elapsed > 1*time.Second {
log.WithContext(ctx).Tracef("SaveAccount for account %s exceeded 1s, took: %v", account.Id, elapsed)
}
}()

// todo: remove this check after the issue is resolved
s.checkAccountDomainBeforeSave(ctx, account.Id, account.Domain)
Expand Down Expand Up @@ -513,6 +519,13 @@ func (s *SqlStore) GetAllAccounts(ctx context.Context) (all []*Account) {
}

func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, error) {
start := time.Now()
defer func() {
elapsed := time.Since(start)
if elapsed > 1*time.Second {
log.WithContext(ctx).Tracef("GetAccount for account %s exceeded 1s, took: %v", accountID, elapsed)
}
}()

var account Account
result := s.db.Model(&account).
Expand Down
126 changes: 48 additions & 78 deletions management/server/telemetry/http_api_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/google/uuid"
"github.com/gorilla/mux"
log "github.com/sirupsen/logrus"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
Expand Down Expand Up @@ -54,112 +55,89 @@ func (rw *WrappedResponseWriter) WriteHeader(code int) {
// HTTPMiddleware handler used to collect metrics of every request/response coming to the API.
// Also adds request tracing (logging).
type HTTPMiddleware struct {
meter metric.Meter
ctx context.Context
ctx context.Context
// all HTTP requests by endpoint & method
httpRequestCounters map[string]metric.Int64Counter
httpRequestCounter metric.Int64Counter
// all HTTP responses by endpoint & method & status code
httpResponseCounters map[string]metric.Int64Counter
httpResponseCounter metric.Int64Counter
// all HTTP requests
totalHTTPRequestsCounter metric.Int64Counter
// all HTTP responses
totalHTTPResponseCounter metric.Int64Counter
// all HTTP responses by status code
totalHTTPResponseCodeCounters map[int]metric.Int64Counter
totalHTTPResponseCodeCounter metric.Int64Counter
// all HTTP requests durations by endpoint and method
httpRequestDurations map[string]metric.Int64Histogram
httpRequestDuration metric.Int64Histogram
// all HTTP requests durations
totalHTTPRequestDuration metric.Int64Histogram
}

// NewMetricsMiddleware creates a new HTTPMiddleware
func NewMetricsMiddleware(ctx context.Context, meter metric.Meter) (*HTTPMiddleware, error) {
totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s_total", httpRequestCounterPrefix), metric.WithUnit("1"))
httpRequestCounter, err := meter.Int64Counter(httpRequestCounterPrefix, metric.WithUnit("1"))
if err != nil {
return nil, err
}

totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s_total", httpResponseCounterPrefix), metric.WithUnit("1"))
httpResponseCounter, err := meter.Int64Counter(httpResponseCounterPrefix, metric.WithUnit("1"))
if err != nil {
return nil, err
}

totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s_total", httpRequestDurationPrefix), metric.WithUnit("milliseconds"))
totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpRequestCounterPrefix), metric.WithUnit("1"))
if err != nil {
return nil, err
}

return &HTTPMiddleware{
ctx: ctx,
httpRequestCounters: map[string]metric.Int64Counter{},
httpResponseCounters: map[string]metric.Int64Counter{},
httpRequestDurations: map[string]metric.Int64Histogram{},
totalHTTPResponseCodeCounters: map[int]metric.Int64Counter{},
meter: meter,
totalHTTPRequestsCounter: totalHTTPRequestsCounter,
totalHTTPResponseCounter: totalHTTPResponseCounter,
totalHTTPRequestDuration: totalHTTPRequestDuration,
},
nil
}
totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpResponseCounterPrefix), metric.WithUnit("1"))
if err != nil {
return nil, err
}

// AddHTTPRequestResponseCounter adds a new meter for an HTTP defaultEndpoint and Method (GET, POST, etc)
// Creates one request counter and multiple response counters (one per http response status code).
func (m *HTTPMiddleware) AddHTTPRequestResponseCounter(endpoint string, method string) error {
meterKey := getRequestCounterKey(endpoint, method)
httpReqCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1"))
totalHTTPResponseCodeCounter, err := meter.Int64Counter(fmt.Sprintf("%s.code.total", httpResponseCounterPrefix), metric.WithUnit("1"))
if err != nil {
return err
return nil, err
}
m.httpRequestCounters[meterKey] = httpReqCounter

durationKey := getRequestDurationKey(endpoint, method)
requestDuration, err := m.meter.Int64Histogram(durationKey, metric.WithUnit("milliseconds"))
httpRequestDuration, err := meter.Int64Histogram(httpRequestDurationPrefix, metric.WithUnit("milliseconds"))
if err != nil {
return err
return nil, err
}
m.httpRequestDurations[durationKey] = requestDuration

respCodes := []int{200, 204, 400, 401, 403, 404, 500, 502, 503}
for _, code := range respCodes {
meterKey = getResponseCounterKey(endpoint, method, code)
httpRespCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1"))
if err != nil {
return err
}
m.httpResponseCounters[meterKey] = httpRespCounter

meterKey = fmt.Sprintf("%s_%d_total", httpResponseCounterPrefix, code)
totalHTTPResponseCodeCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1"))
if err != nil {
return err
}
m.totalHTTPResponseCodeCounters[code] = totalHTTPResponseCodeCounter
totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s.total", httpRequestDurationPrefix), metric.WithUnit("milliseconds"))
if err != nil {
return nil, err
}

return nil
return &HTTPMiddleware{
ctx: ctx,
httpRequestCounter: httpRequestCounter,
httpResponseCounter: httpResponseCounter,
httpRequestDuration: httpRequestDuration,
totalHTTPResponseCodeCounter: totalHTTPResponseCodeCounter,
totalHTTPRequestsCounter: totalHTTPRequestsCounter,
totalHTTPResponseCounter: totalHTTPResponseCounter,
totalHTTPRequestDuration: totalHTTPRequestDuration,
},
nil
}

func replaceEndpointChars(endpoint string) string {
endpoint = strings.ReplaceAll(endpoint, "/", "_")
endpoint = strings.ReplaceAll(endpoint, "{", "")
endpoint = strings.ReplaceAll(endpoint, "}", "")
return endpoint
}

func getRequestCounterKey(endpoint, method string) string {
endpoint = replaceEndpointChars(endpoint)
return fmt.Sprintf("%s%s_%s", httpRequestCounterPrefix, endpoint, method)
}

func getRequestDurationKey(endpoint, method string) string {
endpoint = replaceEndpointChars(endpoint)
return fmt.Sprintf("%s%s_%s", httpRequestDurationPrefix, endpoint, method)
}

func getResponseCounterKey(endpoint, method string, status int) string {
endpoint = replaceEndpointChars(endpoint)
return fmt.Sprintf("%s%s_%s_%d", httpResponseCounterPrefix, endpoint, method, status)
func getEndpointMetricAttr(r *http.Request) string {
var endpoint string
route := mux.CurrentRoute(r)
if route != nil {
pathTmpl, err := route.GetPathTemplate()
if err == nil {
endpoint = replaceEndpointChars(pathTmpl)
}
}
return endpoint
}

// Handler logs every request and response and adds the, to metrics.
Expand All @@ -176,11 +154,10 @@ func (m *HTTPMiddleware) Handler(h http.Handler) http.Handler {

log.WithContext(ctx).Tracef("HTTP request %v: %v %v", reqID, r.Method, r.URL)

metricKey := getRequestCounterKey(r.URL.Path, r.Method)
endpointAttr := attribute.String("endpoint", getEndpointMetricAttr(r))
methodAttr := attribute.String("method", r.Method)

if c, ok := m.httpRequestCounters[metricKey]; ok {
c.Add(m.ctx, 1)
}
m.httpRequestCounter.Add(m.ctx, 1, metric.WithAttributes(endpointAttr, methodAttr))
m.totalHTTPRequestsCounter.Add(m.ctx, 1)

w := WrapResponseWriter(rw)
Expand All @@ -193,21 +170,14 @@ func (m *HTTPMiddleware) Handler(h http.Handler) http.Handler {
log.WithContext(ctx).Tracef("HTTP response %v: %v %v status %v", reqID, r.Method, r.URL, w.Status())
}

metricKey = getResponseCounterKey(r.URL.Path, r.Method, w.Status())
if c, ok := m.httpResponseCounters[metricKey]; ok {
c.Add(m.ctx, 1)
}
statusCodeAttr := attribute.Int("code", w.Status())

m.httpResponseCounter.Add(m.ctx, 1, metric.WithAttributes(endpointAttr, methodAttr, statusCodeAttr))
m.totalHTTPResponseCounter.Add(m.ctx, 1)
if c, ok := m.totalHTTPResponseCodeCounters[w.Status()]; ok {
c.Add(m.ctx, 1)
}
m.totalHTTPResponseCodeCounter.Add(m.ctx, 1, metric.WithAttributes(statusCodeAttr))

durationKey := getRequestDurationKey(r.URL.Path, r.Method)
reqTook := time.Since(reqStart)
if c, ok := m.httpRequestDurations[durationKey]; ok {
c.Record(m.ctx, reqTook.Milliseconds())
}
m.httpRequestDuration.Record(m.ctx, reqTook.Milliseconds(), metric.WithAttributes(endpointAttr, methodAttr))
log.WithContext(ctx).Debugf("request %s %s took %d ms and finished with status %d", r.Method, r.URL.Path, reqTook.Milliseconds(), w.Status())

if w.Status() == 200 && (r.Method == http.MethodPut || r.Method == http.MethodPost || r.Method == http.MethodDelete) {
Expand Down

0 comments on commit d97b036

Please sign in to comment.