Skip to content

Commit a1abac6

Browse files
committed
feat (postStart) : Allow debugging poststart failures with sleep by trapping errors
Add an optional debug mechanism for postStart lifecycle hooks. When enabled via the `controller.devfile.io/debug-start: "true"` annotation, any failure in a postStart command results in the container sleeping for some seconds as per configured progressTimeout, allowing developers time to inspect the container state. - Added `enableDebugStart` parameter to poststart methods. - Injects `trap ... sleep` into postStart scripts when debug mode is enabled. - Includes support for both timeout-wrapped (`postStartTimeout`) and non-timeout lifecycle scripts. This feature improves debuggability of DevWorkspaces where postStart hooks fail and would otherwise cause container crash/restarts. Signed-off-by: Rohan Kumar <[email protected]>
1 parent 7861627 commit a1abac6

18 files changed

+325
-34
lines changed

controllers/workspace/devworkspace_controller.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,12 +323,17 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
323323
}
324324
}
325325

326+
postStartDebugTrapSleepDuration := ""
327+
if workspace.Annotations[constants.DevWorkspaceDebugStartAnnotation] == "true" {
328+
postStartDebugTrapSleepDuration = workspace.Config.Workspace.ProgressTimeout
329+
}
326330
devfilePodAdditions, err := containerlib.GetKubeContainersFromDevfile(
327331
&workspace.Spec.Template,
328332
workspace.Config.Workspace.ContainerSecurityContext,
329333
workspace.Config.Workspace.ImagePullPolicy,
330334
workspace.Config.Workspace.DefaultContainerResources,
331335
workspace.Config.Workspace.PostStartTimeout,
336+
postStartDebugTrapSleepDuration,
332337
)
333338
if err != nil {
334339
return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil

docs/additional-configuration.adoc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,14 @@ The DevWorkspace Operator sets the `volumeMounts` by default for config files, m
348348
## Debugging a failing workspace
349349
Normally, when a workspace fails to start, the deployment will be scaled down and the workspace will be stopped in a `Failed` state. This can make it difficult to debug misconfiguration errors, so the annotation `controller.devfile.io/debug-start: "true"` can be applied to DevWorkspaces to leave resources for failed workspaces on the cluster. This allows viewing logs from workspace containers.
350350

351+
It also enables a specialized debug mode for `postStart` lifecycle hooks, which are often used for initial setup tasks.
352+
353+
When a postStart command fails:
354+
- The container does not immediately crash or restart; it stays in the `ContainerCreating` phase.
355+
- The command failure is trapped, and the container instead sleeps for the duration of the configured DevWorkspace `progressTimeout` (5 minutes by default).
356+
357+
This sleep gives developers a window in which to connect to the container (e.g., using `kubectl exec`), inspect the file system, and review the logs in `/tmp/poststart-stderr.txt` and `/tmp/poststart-stdout.txt` to diagnose the exact cause of the postStart failure before the workspace ultimately scales down. This applies to both standard and timeout-wrapped postStart scripts.
358+
351359
## Setting RuntimeClass for workspace pods
352360
To run a DevWorkspace with a specific RuntimeClass, the attribute `controller.devfile.io/runtime-class` can be set on the DevWorkspace with the name of the RuntimeClass to be used. If the specified RuntimeClass does not exist, the workspace will fail to start. For example, to run a DevWorkspace using the https://github.com/kata-containers/kata-containers[kata containers] runtime in clusters where this is enabled, the DevWorkspace can be specified:
353361
[source,yaml]

pkg/library/container/container.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ import (
4545
// rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes
4646
//
4747
// Note: Requires DevWorkspace to be flattened (i.e. the DevWorkspace contains no Parent or Components of type Plugin)
48-
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string) (*v1alpha1.PodAdditions, error) {
48+
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string, postStartDebugTrapSleepDuration string) (*v1alpha1.PodAdditions, error) {
4949
if !flatten.DevWorkspaceIsFlattened(workspace, nil) {
5050
return nil, fmt.Errorf("devfile is not flattened")
5151
}
@@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi
7777
podAdditions.Containers = append(podAdditions.Containers, *k8sContainer)
7878
}
7979

80-
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil {
80+
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout, postStartDebugTrapSleepDuration); err != nil {
8181
return nil, err
8282
}
8383

pkg/library/container/container_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func TestGetKubeContainersFromDevfile(t *testing.T) {
8787
t.Run(tt.Name, func(t *testing.T) {
8888
// sanity check that file is read correctly.
8989
assert.True(t, len(tt.Input.Components) > 0, "Input defines no components")
90-
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "")
90+
gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "", "")
9191
if tt.Output.ErrRegexp != nil && assert.Error(t, err) {
9292
assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match")
9393
} else {

pkg/library/lifecycle/poststart.go

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@ package lifecycle
1515

1616
import (
1717
"fmt"
18+
"regexp"
1819
"strings"
1920
"time"
2021

22+
"github.com/go-logr/logr"
23+
2124
dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2"
2225
corev1 "k8s.io/api/core/v1"
2326
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -41,7 +44,9 @@ const (
4144
`
4245
)
4346

44-
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error {
47+
var trapErrRegex = regexp.MustCompile(`\btrap\b.*\bERR\b`)
48+
49+
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string, postStartDebugTrapSleepDuration string) error {
4550
if wksp.Events == nil || len(wksp.Events.PostStart) == 0 {
4651
return nil
4752
}
@@ -69,7 +74,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
6974
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7075
}
7176

72-
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout)
77+
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout, postStartDebugTrapSleepDuration)
7378
if err != nil {
7479
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
7580
}
@@ -85,10 +90,10 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
8590

8691
// processCommandsForPostStart processes a list of DevWorkspace commands
8792
// and generates a corev1.LifecycleHandler for the PostStart lifecycle hook.
88-
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) {
93+
func processCommandsForPostStart(commands []dw.Command, postStartTimeout string, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) {
8994
if postStartTimeout == "" {
9095
// use the fallback if no timeout propagated
91-
return processCommandsWithoutTimeoutFallback(commands)
96+
return processCommandsWithoutTimeoutFallback(commands, postStartDebugTrapSleepDuration)
9297
}
9398

9499
originalUserScript, err := buildUserScript(commands)
@@ -101,7 +106,7 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
101106
scriptToExecute := "set -e\n" + originalUserScript
102107
escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`)
103108

104-
fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout)
109+
fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout, postStartDebugTrapSleepDuration)
105110

106111
finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout)
107112

@@ -128,8 +133,10 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string)
128133
// - |
129134
// cd <workingDir>
130135
// <commandline>
131-
func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) {
136+
func processCommandsWithoutTimeoutFallback(commands []dw.Command, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) {
132137
var dwCommands []string
138+
postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(log.Log, postStartDebugTrapSleepDuration)
139+
hasErrTrapInUserScript := false
133140
for _, command := range commands {
134141
execCmd := command.Exec
135142
if len(execCmd.Env) > 0 {
@@ -139,6 +146,21 @@ func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.Lifec
139146
dwCommands = append(dwCommands, fmt.Sprintf("cd %s", execCmd.WorkingDir))
140147
}
141148
dwCommands = append(dwCommands, execCmd.CommandLine)
149+
if trapErrRegex.MatchString(execCmd.CommandLine) {
150+
hasErrTrapInUserScript = true
151+
}
152+
}
153+
154+
if postStartFailureDebugSleepSeconds > 0 && !hasErrTrapInUserScript {
155+
debugTrap := fmt.Sprintf(`
156+
trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep %d' ERR
157+
`, postStartFailureDebugSleepSeconds)
158+
debugTrapLine := strings.ReplaceAll(strings.TrimSpace(debugTrap), "\n", " ")
159+
160+
dwCommands = append([]string{
161+
"set -e",
162+
debugTrapLine,
163+
}, dwCommands...)
142164
}
143165

144166
joinedCommands := strings.Join(dwCommands, "\n")
@@ -187,7 +209,7 @@ func buildUserScript(commands []dw.Command) (string, error) {
187209
// environment variable exports, and specific exit code handling.
188210
// The killAfterDurationSeconds is hardcoded to 5s within this generated script.
189211
// It conditionally prefixes the user script with the timeout command if available.
190-
func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string {
212+
func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string, postStartDebugTrapSleepDuration string) string {
191213
// Convert `postStartTimeout` into the `timeout` format
192214
var timeoutSeconds int64
193215
if postStartTimeout != "" && postStartTimeout != "0" {
@@ -199,10 +221,12 @@ func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string
199221
timeoutSeconds = int64(duration.Seconds())
200222
}
201223
}
224+
postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(log.Log, postStartDebugTrapSleepDuration)
202225

203226
return fmt.Sprintf(`
204227
export POSTSTART_TIMEOUT_DURATION="%d"
205228
export POSTSTART_KILL_AFTER_DURATION="5"
229+
export DEBUG_ENABLED="%t"
206230
207231
_TIMEOUT_COMMAND_PART=""
208232
_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean
@@ -219,6 +243,11 @@ fi
219243
${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s'
220244
exit_code=$?
221245
246+
if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then
247+
echo "[postStart] failure encountered, sleep for debugging" >&2
248+
sleep %d
249+
fi
250+
222251
# Check the exit code based on whether timeout was attempted
223252
if [ "$_WAS_TIMEOUT_USED" = "true" ]; then
224253
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
@@ -239,5 +268,19 @@ else
239268
fi
240269
241270
exit $exit_code
242-
`, timeoutSeconds, escapedUserScript)
271+
`, timeoutSeconds, postStartFailureDebugSleepSeconds > 0, escapedUserScript, postStartFailureDebugSleepSeconds)
272+
}
273+
274+
// parsePostStartFailureDebugSleepDurationToSeconds converts durationStr into a
// whole number of seconds using time.ParseDuration.
//
// It returns 0 when durationStr is empty or cannot be parsed; a parse failure
// is reported through logger rather than returned to the caller. The callers
// visible in this file only act on the result when it is greater than zero.
func parsePostStartFailureDebugSleepDurationToSeconds(logger logr.Logger, durationStr string) int {
275+
if durationStr == "" {
276+
return 0
277+
}
278+
279+
d, err := time.ParseDuration(durationStr)
280+
if err != nil {
281+
logger.Error(err, "Failed to parse postStart failure debug sleep duration for ", "durationStr", durationStr)
282+
return 0
283+
}
284+
285+
// The float64-to-int conversion truncates toward zero, so a sub-second
// duration (e.g. "500ms") yields 0.
return int(d.Seconds())
243286
}

0 commit comments

Comments
 (0)