Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Gather memory metrics for the full chromium process tree #54

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion crocochrome.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,20 @@ func (s *Supervisor) launch(ctx context.Context, sessionID string) error {
s.metrics.SessionDuration.Observe(time.Since(created).Seconds())
}()

err = cmd.Run()
// Start process, start monitor, then wait for it to finish.
err = func() error {
err := cmd.Start()
if err != nil {
return fmt.Errorf("starting chromium: %w", err)
}

err = cmd.Wait()
if err != nil {
return fmt.Errorf("running chromium: %w", err)
}

return nil
}()

attrs := make([]slog.Attr, 0, 9)

Expand All @@ -320,6 +333,12 @@ func (s *Supervisor) launch(ctx context.Context, sessionID string) error {
}

if cmd.ProcessState != nil {
if rUsage, isSyscallRusage := cmd.ProcessState.SysUsage().(*syscall.Rusage); rUsage != nil && isSyscallRusage {
s.metrics.ChromiumResources.With(map[string]string{
metrics.Resource: metrics.ResourceRSS,
}).Observe(float64(rUsage.Maxrss * 1024)) // Convert from KiB to Bytes, as it is conventional in metrics.
}

attrs = append(attrs,
slog.Attr{Key: "pid", Value: slog.IntValue(cmd.ProcessState.Pid())},
slog.Attr{Key: "exitCode", Value: slog.IntValue(cmd.ProcessState.ExitCode())},
Expand Down
19 changes: 19 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ const (
ExecutionState = "state"
ExecutionStateFinished = "finished"
ExecutionStateFailed = "failed"

Resource = "resource"
ResourceRSS = "rss"
)

// InstrumentHTTP uses promhttp to instrument a handler with total, duration, and in-flight requests.
Expand Down Expand Up @@ -53,6 +56,7 @@ func InstrumentHTTP(reg prometheus.Registerer, handler http.Handler) http.Handle
type SupervisorMetrics struct {
SessionDuration prometheus.Histogram
ChromiumExecutions *prometheus.CounterVec
ChromiumResources *prometheus.HistogramVec
}

// Supervisor registers and returns handlers for metrics used by the supervisor.
Expand Down Expand Up @@ -81,10 +85,25 @@ func Supervisor(reg prometheus.Registerer) *SupervisorMetrics {
},
[]string{ExecutionState},
),
ChromiumResources: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metricNs,
Subsystem: metricSubsystemCrocochrome,
Name: "chromium_resource_usage",
Help: "Resources used by chromium when the execution ends." +
"Memory resources are expressed in bytes.",
Buckets: prometheus.LinearBuckets(0, 64<<20, 16), // 64Mi*16=1024Mi
NativeHistogramBucketFactor: 1.2,
NativeHistogramMaxBucketNumber: 32,
NativeHistogramMinResetDuration: 1 * time.Hour,
},
[]string{Resource},
),
}

reg.MustRegister(m.SessionDuration)
reg.MustRegister(m.ChromiumExecutions)
reg.MustRegister(m.ChromiumResources)

return m
}
143 changes: 143 additions & 0 deletions psutil/psutil.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package psutil

import (
"fmt"
"io/fs"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)

type Stats struct {
RSSBytes int64
}

func (s Stats) Aggregate(other Stats) Stats {
s.RSSBytes += other.RSSBytes

return s
}

// PollStats periodically polls procfs for pid and its descendants, aggregating stats for all of them. The returned channel
// will be sent what PollStats considers valid snapshots of the process tree, that is, where no errors traversing it were
// detected. When `pid` stops existing, PollStats closes the channel.
func PollStats(pid int) <-chan Stats {
ch := make(chan Stats)
go func() {
defer close(ch)

for {
time.Sleep(100 * time.Millisecond)

treeStats := make([]Stats, 0, 8)
mainstats, err := StatsFor(pid)
if err != nil {
// Stats for the main PID are not readable, it is either gone or we lack permissions. Return.
return
}

treeStats = append(treeStats, mainstats)
ptree, err := Tree(pid)
if err != nil {
// Found an error traversing children, assuming process tree changed while we were scanning it. Retry.
continue
}

for _, child := range ptree[1:] { // First item is the main process, specialcased above.
stats, err := StatsFor(child)
if err != nil {
// Found an error fetching child stats, assuming process tree changed while we were scanning it. Retry.
continue
}

treeStats = append(treeStats, stats)
}

aggregatedStats := Stats{}
for _, stats := range treeStats {
aggregatedStats = aggregatedStats.Aggregate(stats)
}

ch <- aggregatedStats
}
}()

return ch
}

func StatsFor(pid int) (Stats, error) {
mem, err := os.ReadFile(filepath.Join(procPath(pid), "mem"))
if err != nil {
return Stats{}, fmt.Errorf("reading stats: %w", err)
}

// TODO: This is a mock.
rss, err := strconv.ParseInt(string(mem), 10, 64)
if err != nil {
return Stats{}, fmt.Errorf("parsing memory: %w", err)
}

return Stats{
RSSBytes: rss,
}, nil
}

// Tree returns a list of PIDs for pid and its children, recursively.
// The first PID on the returned slice is pid.
func Tree(pid int) ([]int, error) {
pids := make([]int, 0, 8)

// Add yourself.
pids = append(pids, pid)

// For each children, recurse.
children, err := ChildrenOf(pid)
if err != nil {
return nil, err
}
for _, child := range children {
childTree, err := Tree(child)
if err != nil {
return nil, err
}

pids = append(pids, childTree...)
}

return pids, nil
}

// ChildrenOf returns the immediate children of pid. Grandchildren and pid itself are not returned, and thus the
// returned slice can be empty for a process that do not have children.
func ChildrenOf(pid int) ([]int, error) {
children := make([]int, 0, 8)
err := fs.WalkDir(os.DirFS(procPath(pid)), "task", func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}

taskChildren, err := os.ReadFile(filepath.Join(path, "children"))
if err != nil {
return fmt.Errorf("reading children file: %w", err)
}

for _, childStr := range strings.Split(string(taskChildren), " ") {
child, err := strconv.Atoi(childStr)
if err != nil {
return fmt.Errorf("parsing pid %q: %w", childStr, err)
}

children = append(children, int(child))
}

return nil
})

return children, err
}

func procPath(pid int) string {
return filepath.Join("/", "proc", strconv.Itoa(pid))
}