diff --git a/crocochrome.go b/crocochrome.go index 419557e..5425518 100644 --- a/crocochrome.go +++ b/crocochrome.go @@ -311,7 +311,20 @@ func (s *Supervisor) launch(ctx context.Context, sessionID string) error { s.metrics.SessionDuration.Observe(time.Since(created).Seconds()) }() - err = cmd.Run() + // Start process, start monitor, then wait for it to finish. + err = func() error { + err := cmd.Start() + if err != nil { + return fmt.Errorf("starting chromium: %w", err) + } + + err = cmd.Wait() + if err != nil { + return fmt.Errorf("running chromium: %w", err) + } + + return nil + }() attrs := make([]slog.Attr, 0, 9) @@ -320,6 +333,12 @@ func (s *Supervisor) launch(ctx context.Context, sessionID string) error { } if cmd.ProcessState != nil { + if rUsage, isSyscallRusage := cmd.ProcessState.SysUsage().(*syscall.Rusage); rUsage != nil && isSyscallRusage { + s.metrics.ChromiumResources.With(map[string]string{ + metrics.Resource: metrics.ResourceRSS, + }).Observe(float64(rUsage.Maxrss * 1024)) // Convert from KiB to Bytes, as it is conventional in metrics. + } + attrs = append(attrs, slog.Attr{Key: "pid", Value: slog.IntValue(cmd.ProcessState.Pid())}, slog.Attr{Key: "exitCode", Value: slog.IntValue(cmd.ProcessState.ExitCode())}, diff --git a/metrics/metrics.go b/metrics/metrics.go index 3de65c2..c72cf3e 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -15,6 +15,9 @@ const ( ExecutionState = "state" ExecutionStateFinished = "finished" ExecutionStateFailed = "failed" + + Resource = "resource" + ResourceRSS = "rss" ) // InstrumentHTTP uses promhttp to instrument a handler with total, duration, and in-flight requests. @@ -53,6 +56,7 @@ func InstrumentHTTP(reg prometheus.Registerer, handler http.Handler) http.Handle type SupervisorMetrics struct { SessionDuration prometheus.Histogram ChromiumExecutions *prometheus.CounterVec + ChromiumResources *prometheus.HistogramVec } // Supervisor registers and returns handlers for metrics used by the supervisor. @@ -81,10 +85,25 @@ func Supervisor(reg prometheus.Registerer) *SupervisorMetrics { }, []string{ExecutionState}, ), + ChromiumResources: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metricNs, + Subsystem: metricSubsystemCrocochrome, + Name: "chromium_resource_usage", + Help: "Resources used by chromium when the execution ends." + + "Memory resources are expressed in bytes.", + Buckets: prometheus.LinearBuckets(0, 64<<20, 16), // 64Mi*16=1024Mi + NativeHistogramBucketFactor: 1.2, + NativeHistogramMaxBucketNumber: 32, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, + []string{Resource}, + ), } reg.MustRegister(m.SessionDuration) reg.MustRegister(m.ChromiumExecutions) + reg.MustRegister(m.ChromiumResources) return m } diff --git a/psutil/psutil.go b/psutil/psutil.go new file mode 100644 index 0000000..6239ca1 --- /dev/null +++ b/psutil/psutil.go @@ -0,0 +1,143 @@ +package psutil + +import ( + "fmt" + "io/fs" + "os" + "path/filepath" + "strconv" + "strings" + "time" +) + +type Stats struct { + RSSBytes int64 +} + +func (s Stats) Aggregate(other Stats) Stats { + s.RSSBytes += other.RSSBytes + + return s +} + +// PollStats periodically polls procfs for pid and its descendants, aggregating stats for all of them. The returned channel +// will be sent what PollStats considers valid snapshots of the process tree, that is, where no errors traversing it were +// detected. When `pid` stops existing, PollStats closes the channel. +func PollStats(pid int) <-chan Stats { + ch := make(chan Stats) + go func() { + defer close(ch) + + for { + time.Sleep(100 * time.Millisecond) + + treeStats := make([]Stats, 0, 8) + mainstats, err := StatsFor(pid) + if err != nil { + // Stats for the main PID are not readable, it is either gone or we lack permissions. Return. + return + } + + treeStats = append(treeStats, mainstats) + ptree, err := Tree(pid) + if err != nil { + // Found an error traversing children, assuming process tree changed while we were scanning it. Retry. + continue + } + + for _, child := range ptree[1:] { // First item is the main process, specialcased above. + stats, err := StatsFor(child) + if err != nil { + // Found an error fetching child stats, assuming process tree changed while we were scanning it. Retry. + continue + } + + treeStats = append(treeStats, stats) + } + + aggregatedStats := Stats{} + for _, stats := range treeStats { + aggregatedStats = aggregatedStats.Aggregate(stats) + } + + ch <- aggregatedStats + } + }() + + return ch +} + +func StatsFor(pid int) (Stats, error) { + mem, err := os.ReadFile(filepath.Join(procPath(pid), "mem")) + if err != nil { + return Stats{}, fmt.Errorf("reading stats: %w", err) + } + + // TODO: This is a mock. + rss, err := strconv.ParseInt(string(mem), 10, 64) + if err != nil { + return Stats{}, fmt.Errorf("parsing memory: %w", err) + } + + return Stats{ + RSSBytes: rss, + }, nil +} + +// Tree returns a list of PIDs for pid and its children, recursively. +// The first PID on the returned slice is pid. +func Tree(pid int) ([]int, error) { + pids := make([]int, 0, 8) + + // Add yourself. + pids = append(pids, pid) + + // For each children, recurse. + children, err := ChildrenOf(pid) + if err != nil { + return nil, err + } + for _, child := range children { + childTree, err := Tree(child) + if err != nil { + return nil, err + } + + pids = append(pids, childTree...) + } + + return pids, nil +} + +// ChildrenOf returns the immediate children of pid. Grandchildren and pid itself are not returned, and thus the +// returned slice can be empty for a process that do not have children. +func ChildrenOf(pid int) ([]int, error) { + children := make([]int, 0, 8) + err := fs.WalkDir(os.DirFS(procPath(pid)), "task", func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + taskChildren, err := os.ReadFile(filepath.Join(path, "children")) + if err != nil { + return fmt.Errorf("reading children file: %w", err) + } + + for _, childStr := range strings.Split(string(taskChildren), " ") { + child, err := strconv.Atoi(childStr) + if err != nil { + return fmt.Errorf("parsing pid %q: %w", childStr, err) + } + + children = append(children, int(child)) + } + + return nil + }) + + return children, err +} + +func procPath(pid int) string { + return filepath.Join("/", "proc", strconv.Itoa(pid)) +}