Skip to content

Commit

Permalink
fix(CSI-276): allow unpublish even if publish failed with stale file …
Browse files Browse the repository at this point in the history
…handle (#356)

### TL;DR

Improved error handling and logging in the NodeServer implementation.

### What changed?

- Added handling for stale NFS mounts in `NodeUnpublishVolume`.

### How to test?

1. Provision a snapshot-backed PVC and attach it to node on a 2.5.0-beta version of CSI plugin. You might hit a "stale file handle" error that causes the pod to not be able to access the volume contents. In such case, deletion of the pod will fail either.
2. Upgrade the plugin to latest version
3. Unpublish the volume by terminating the pod. Ensure that the PVC is deleted

### Why make this change?

These changes ensure that pods with mis-attached PVCs (on version 2.5.0-beta2, 2.5.0-beta) may be detached and pods can be terminated. Without the fix, a node reboot might be required to remove the malfunctioning pods.
  • Loading branch information
sergeyberezansky authored Oct 8, 2024
2 parents 4a4150f + 5dd547a commit cf3a1bc
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions pkg/wekafs/nodeserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ package wekafs

import (
"context"
"errors"
"fmt"
"github.com/container-storage-interface/spec/lib/go/csi"
"github.com/pkg/errors"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"go.opentelemetry.io/otel"
Expand Down Expand Up @@ -120,7 +120,7 @@ func (ns *NodeServer) NodeGetVolumeStats(ctx context.Context, req *csi.NodeGetVo

// Validate Weka volume ID
if err := validateVolumeId(volumeID); err != nil {
return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "invalid volume ID").Error())
return nil, status.Errorf(codes.InvalidArgument, "invalid volume ID %s: %v", volumeID, err)
}

stats, err := getVolumeStats(volumePath)
Expand Down Expand Up @@ -404,7 +404,7 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis

func NodeUnpublishVolumeError(ctx context.Context, errorCode codes.Code, errorMessage string) (*csi.NodeUnpublishVolumeResponse, error) {
err := status.Error(errorCode, strings.ToLower(errorMessage))
log.Ctx(ctx).Err(err).CallerSkipFrame(1).Msg("Error deleting volume")
log.Ctx(ctx).Err(err).CallerSkipFrame(1).Msg("Error unpublishing volume")
return &csi.NodeUnpublishVolumeResponse{}, err
}

Expand Down Expand Up @@ -448,6 +448,9 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
logger.Debug().Msg("Target path does not exist, assuming repeating unpublish request")
result = "SUCCESS"
return &csi.NodeUnpublishVolumeResponse{}, nil
} else if pathErr, ok := err.(*os.PathError); ok && errors.Is(pathErr.Err, syscall.ESTALE) {
logger.Debug().Msg("Target path is stale, assuming NFS publish failure")
goto FORCEUMOUNT
} else {
logger.Error().Err(err).Msg("Failed to check target path")
return NodeUnpublishVolumeError(ctx, codes.Internal, "unexpected situation, please contact support")
Expand All @@ -470,6 +473,7 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
}
}

FORCEUMOUNT:
logger.Trace().Str("target_path", targetPath).Msg("Unmounting")
if err := mount.New("").Unmount(targetPath); err != nil {
//it seems that when NodeUnpublishRequest appears, this target path is already not existing, e.g. due to pod being deleted
Expand Down

0 comments on commit cf3a1bc

Please sign in to comment.