Skip to content

Commit

Permalink
Merge pull request kubernetes#55641 from sjenning/remove-corrupt-chec…
Browse files Browse the repository at this point in the history
…kpoints

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

dockershim: remove corrupt checkpoints immediately upon detection

Fixes kubernetes#55620

The current checkpoint abstraction leaks the handling of corrupt checkpoints to the user.

If the user does a `GetCheckpoint()` and the checkpoint is corrupt, the corrupt checkpoint is returned to the user (why??) with an error indicating the corruption.  It is then up to the user to detect the corruption via the error msg and call `RemoveCheckpoint()` to remove the corrupted checkpoint.

The checkpoint abstraction should not expose corruption to the user.  If it is corrupt, it is as good as not found to the user.  The checkpoint code should handle cleanup of corrupt entries and report "not found" to the user.

@derekwaynecarr @eparis @dcbw @freehan
  • Loading branch information
Kubernetes Submit Queue authored Nov 14, 2017
2 parents 48d0627 + a4bc770 commit b983cee
Show file tree
Hide file tree
Showing 9 changed files with 9 additions and 83 deletions.
1 change: 0 additions & 1 deletion hack/.golint_failures
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,6 @@ pkg/kubelet/container/testing
pkg/kubelet/custommetrics
pkg/kubelet/dockershim
pkg/kubelet/dockershim/cm
pkg/kubelet/dockershim/errors
pkg/kubelet/dockershim/libdocker
pkg/kubelet/dockershim/remote
pkg/kubelet/dockershim/testing
Expand Down
2 changes: 0 additions & 2 deletions pkg/kubelet/dockershim/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ go_library(
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/dockershim/cm:go_default_library",
"//pkg/kubelet/dockershim/errors:go_default_library",
"//pkg/kubelet/dockershim/libdocker:go_default_library",
"//pkg/kubelet/dockershim/metrics:go_default_library",
"//pkg/kubelet/leaky:go_default_library",
Expand Down Expand Up @@ -142,7 +141,6 @@ filegroup(
srcs = [
":package-srcs",
"//pkg/kubelet/dockershim/cm:all-srcs",
"//pkg/kubelet/dockershim/errors:all-srcs",
"//pkg/kubelet/dockershim/libdocker:all-srcs",
"//pkg/kubelet/dockershim/metrics:all-srcs",
"//pkg/kubelet/dockershim/remote:all-srcs",
Expand Down
11 changes: 6 additions & 5 deletions pkg/kubelet/dockershim/docker_checkpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"path/filepath"

"github.com/golang/glog"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
hashutil "k8s.io/kubernetes/pkg/util/hash"
Expand Down Expand Up @@ -113,12 +112,14 @@ func (handler *PersistentCheckpointHandler) GetCheckpoint(podSandboxID string) (
//TODO: unmarhsal into a struct with just Version, check version, unmarshal into versioned type.
err = json.Unmarshal(blob, &checkpoint)
if err != nil {
glog.Errorf("Failed to unmarshal checkpoint %q. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
return &checkpoint, errors.CorruptCheckpointError
glog.Errorf("Failed to unmarshal checkpoint %q, removing checkpoint. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
handler.RemoveCheckpoint(podSandboxID)
return nil, fmt.Errorf("failed to unmarshal checkpoint")
}
if checkpoint.CheckSum != calculateChecksum(checkpoint) {
glog.Errorf("Checksum of checkpoint %q is not valid", podSandboxID)
return &checkpoint, errors.CorruptCheckpointError
glog.Errorf("Checksum of checkpoint %q is not valid, removing checkpoint", podSandboxID)
handler.RemoveCheckpoint(podSandboxID)
return nil, fmt.Errorf("checkpoint is corrupted")
}
return &checkpoint, nil
}
Expand Down
20 changes: 1 addition & 19 deletions pkg/kubelet/dockershim/docker_sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (
utilerrors "k8s.io/apimachinery/pkg/util/errors"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/types"
Expand Down Expand Up @@ -193,20 +192,10 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
// actions will only have sandbox ID and not have pod namespace and name information.
// Return error if encounter any unexpected error.
if checkpointErr != nil {
if libdocker.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
if libdocker.IsContainerNotFoundError(statusErr) {
glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
"Proceed without further sandbox information.", podSandboxID)
} else {
if checkpointErr == errors.CorruptCheckpointError {
// Remove the corrupted checkpoint so that the next
// StopPodSandbox call can proceed. This may indicate that
// some resources won't be reclaimed.
// TODO (#43021): Fix this properly.
glog.Warningf("Removing corrupted checkpoint %q: %+v", podSandboxID, *checkpoint)
if err := ds.checkpointHandler.RemoveCheckpoint(podSandboxID); err != nil {
glog.Warningf("Unable to remove corrupted checkpoint %q: %v", podSandboxID, err)
}
}
return utilerrors.NewAggregate([]error{
fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
fmt.Errorf("failed to get sandbox status: %v", statusErr)})
Expand Down Expand Up @@ -488,13 +477,6 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]
checkpoint, err := ds.checkpointHandler.GetCheckpoint(id)
if err != nil {
glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err)

if err == errors.CorruptCheckpointError {
glog.Warningf("Removing corrupted checkpoint %q: %+v", id, *checkpoint)
if err := ds.checkpointHandler.RemoveCheckpoint(id); err != nil {
glog.Warningf("Unable to remove corrupted checkpoint %q: %v", id, err)
}
}
continue
}
result = append(result, checkpointToRuntimeAPISandbox(id, checkpoint))
Expand Down
5 changes: 0 additions & 5 deletions pkg/kubelet/dockershim/docker_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ import (
kubecm "k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/dockershim/cm"
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
"k8s.io/kubernetes/pkg/kubelet/network"
"k8s.io/kubernetes/pkg/kubelet/network/cni"
"k8s.io/kubernetes/pkg/kubelet/network/hostport"
Expand Down Expand Up @@ -366,10 +365,6 @@ func (ds *dockerService) GetPodPortMappings(podSandboxID string) ([]*hostport.Po
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
// Return empty portMappings if checkpoint is not found
if err != nil {
if err == errors.CheckpointNotFoundError {
glog.Warningf("Failed to retrieve checkpoint for sandbox %q: %v", podSandboxID, err)
return nil, nil
}
return nil, err
}

Expand Down
25 changes: 0 additions & 25 deletions pkg/kubelet/dockershim/errors/BUILD

This file was deleted.

22 changes: 0 additions & 22 deletions pkg/kubelet/dockershim/errors/errors.go

This file was deleted.

1 change: 0 additions & 1 deletion pkg/kubelet/dockershim/testing/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ go_library(
name = "go_default_library",
srcs = ["util.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/dockershim/testing",
deps = ["//pkg/kubelet/dockershim/errors:go_default_library"],
)

filegroup(
Expand Down
5 changes: 2 additions & 3 deletions pkg/kubelet/dockershim/testing/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ limitations under the License.
package testing

import (
"fmt"
"sync"

"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
)

// MemStore is an implementation of CheckpointStore interface which stores checkpoint in memory.
Expand All @@ -44,7 +43,7 @@ func (mstore *MemStore) Read(key string) ([]byte, error) {
defer mstore.Unlock()
data, ok := mstore.mem[key]
if !ok {
return nil, errors.CheckpointNotFoundError
return nil, fmt.Errorf("checkpoint is not found")
}
return data, nil
}
Expand Down

0 comments on commit b983cee

Please sign in to comment.