mirror of
https://github.com/yunionio/cloudpods.git
synced 2026-06-20 09:32:13 +08:00
feat(region,host,climc): stop container forcely by kill parent process (#21535)
This commit is contained in:
@@ -131,7 +131,8 @@ type ContainerListInput struct {
|
||||
}
|
||||
|
||||
// ContainerStopInput is the request body of the container stop action.
type ContainerStopInput struct {
	// Timeout is the stop timeout.
	// NOTE(review): the unit is not visible here, presumably seconds; confirm against the host side.
	Timeout int `json:"timeout"`
	// Force requests a forced stop of the container.
	Force bool `json:"force"`
}
|
||||
|
||||
type ContainerSyncStatusResponse struct {
|
||||
|
||||
@@ -132,4 +132,5 @@ type ContainerStopInput struct {
|
||||
Timeout int64 `json:"timeout"`
|
||||
ShmSizeMB int `json:"shm_size_mb"`
|
||||
ContainerName string `json:"container_name"`
|
||||
Force bool `json:"force"`
|
||||
}
|
||||
|
||||
@@ -29,11 +29,11 @@ type SAlert struct {
|
||||
apis.SStatusStandaloneResourceBase
|
||||
SMonitorScopedResource
|
||||
// Frequency is evaluate period
|
||||
Frequency int64 `json:"frequency"`
|
||||
Frequency int64 `json:"frequency"`
|
||||
Settings *AlertSetting `json:"settings"`
|
||||
Level string `json:"level"`
|
||||
Message string `json:"message"`
|
||||
UsedBy string `json:"used_by"`
|
||||
Level string `json:"level"`
|
||||
Message string `json:"message"`
|
||||
UsedBy string `json:"used_by"`
|
||||
// Silenced bool
|
||||
ExecutionError string `json:"execution_error"`
|
||||
// If an alert rule has a configured `For` and the query violates the configured threshold
|
||||
|
||||
@@ -510,8 +510,13 @@ func (c *SContainer) StartStartTask(ctx context.Context, userCred mcclient.Token
|
||||
}
|
||||
|
||||
func (c *SContainer) PerformStop(ctx context.Context, userCred mcclient.TokenCredential, query jsonutils.JSONObject, data *api.ContainerStopInput) (jsonutils.JSONObject, error) {
|
||||
if !sets.NewString(api.CONTAINER_STATUS_RUNNING, api.CONTAINER_STATUS_STOP_FAILED).Has(c.Status) {
|
||||
return nil, httperrors.NewInvalidStatusError("Can't stop container in status %s", c.Status)
|
||||
if !data.Force {
|
||||
if !sets.NewString(
|
||||
api.CONTAINER_STATUS_RUNNING,
|
||||
api.CONTAINER_STATUS_PROBING,
|
||||
api.CONTAINER_STATUS_STOP_FAILED).Has(c.Status) {
|
||||
return nil, httperrors.NewInvalidStatusError("Can't stop container in status %s", c.Status)
|
||||
}
|
||||
}
|
||||
return nil, c.StartStopTask(ctx, userCred, data, "")
|
||||
}
|
||||
|
||||
@@ -1156,7 +1156,7 @@ func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient
|
||||
return nil, errors.Wrapf(err, "unmount shm %s", name)
|
||||
}
|
||||
}
|
||||
if err := s.getCRI().StopContainer(ctx, criId, timeout, true); err != nil {
|
||||
if err := s.getCRI().StopContainer(ctx, criId, timeout, true, input.Force); err != nil {
|
||||
if !IsContainerNotFoundError(err) {
|
||||
return nil, errors.Wrap(err, "CRI.StopContainer")
|
||||
} else {
|
||||
|
||||
@@ -11,10 +11,11 @@
|
||||
package apis
|
||||
|
||||
import (
|
||||
protoreflect "google.golang.org/protobuf/reflect/protoreflect"
|
||||
protoimpl "google.golang.org/protobuf/runtime/protoimpl"
|
||||
reflect "reflect"
|
||||
sync "sync"
|
||||
|
||||
protoreflect "google.golang.org/protobuf/reflect/protoreflect"
|
||||
protoimpl "google.golang.org/protobuf/runtime/protoimpl"
|
||||
)
|
||||
|
||||
const (
|
||||
|
||||
@@ -8,6 +8,7 @@ package apis
|
||||
|
||||
import (
|
||||
context "context"
|
||||
|
||||
grpc "google.golang.org/grpc"
|
||||
codes "google.golang.org/grpc/codes"
|
||||
status "google.golang.org/grpc/status"
|
||||
|
||||
@@ -290,7 +290,8 @@ func (o *ContainerIdsOptions) Params() (jsonutils.JSONObject, error) {
|
||||
|
||||
type ContainerStopOptions struct {
|
||||
ContainerIdsOptions
|
||||
Timeout int `help:"Stopping timeout" json:"timeout"`
|
||||
Timeout int `help:"Stopping timeout" json:"timeout"`
|
||||
Force bool `help:"Force stop container" json:"force"`
|
||||
}
|
||||
|
||||
func (o *ContainerStopOptions) Params() (jsonutils.JSONObject, error) {
|
||||
|
||||
@@ -17,6 +17,8 @@ package pod
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -24,8 +26,11 @@ import (
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
"yunion.io/x/jsonutils"
|
||||
"yunion.io/x/log"
|
||||
"yunion.io/x/pkg/errors"
|
||||
|
||||
"yunion.io/x/onecloud/pkg/util/procutils"
|
||||
)
|
||||
|
||||
type CRI interface {
|
||||
@@ -36,7 +41,7 @@ type CRI interface {
|
||||
RemovePod(ctx context.Context, podId string) error
|
||||
CreateContainer(ctx context.Context, podId string, podConfig *runtimeapi.PodSandboxConfig, ctrConfig *runtimeapi.ContainerConfig, withPull bool) (string, error)
|
||||
StartContainer(ctx context.Context, id string) error
|
||||
StopContainer(ctx context.Context, ctrId string, timeout int64, tryRemove bool) error
|
||||
StopContainer(ctx context.Context, ctrId string, timeout int64, tryRemove bool, force bool) error
|
||||
RemoveContainer(ctx context.Context, ctrId string) error
|
||||
RunContainers(ctx context.Context, podConfig *runtimeapi.PodSandboxConfig, containerConfigs []*runtimeapi.ContainerConfig, runtimeHandler string) (*RunContainersResponse, error)
|
||||
ListContainers(ctx context.Context, opts ListContainerOptions) ([]*runtimeapi.Container, error)
|
||||
@@ -330,8 +335,7 @@ func (c crictl) RemovePod(ctx context.Context, podId string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c crictl) StopContainer(ctx context.Context, ctrId string, timeout int64, tryRemove bool) error {
|
||||
maxTries := 10
|
||||
func (c crictl) stopContainerWithRetry(ctx context.Context, ctrId string, timeout int64, maxTries int) error {
|
||||
interval := 5 * time.Second
|
||||
errs := []error{}
|
||||
for tries := 0; tries < maxTries; tries++ {
|
||||
@@ -342,6 +346,9 @@ func (c crictl) StopContainer(ctx context.Context, ctrId string, timeout int64,
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
if strings.Contains(err.Error(), "code = NotFound") {
|
||||
return nil
|
||||
}
|
||||
dur := interval * time.Duration(tries+1)
|
||||
log.Warningf("try to restop container %s after %s, timeout: %d: %v", ctrId, dur, timeout, err)
|
||||
// set timeout to 0 to stop forcibly
|
||||
@@ -349,6 +356,24 @@ func (c crictl) StopContainer(ctx context.Context, ctrId string, timeout int64,
|
||||
errs = append(errs, errors.Wrapf(err, "try %d", tries))
|
||||
time.Sleep(dur)
|
||||
}
|
||||
return errors.NewAggregate(errs)
|
||||
}
|
||||
|
||||
func (c crictl) StopContainer(ctx context.Context, ctrId string, timeout int64, tryRemove bool, force bool) error {
|
||||
maxTries := 10
|
||||
errs := []error{}
|
||||
err := c.stopContainerWithRetry(ctx, ctrId, timeout, maxTries)
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
errs = append(errs, err)
|
||||
if force {
|
||||
if err := c.forceKillContainer(ctx, ctrId); err != nil {
|
||||
errs = append(errs, errors.Wrap(err, "forceKillContainer"))
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if tryRemove {
|
||||
// try force remove container
|
||||
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
@@ -372,6 +397,45 @@ func (c crictl) stopContainer(ctx context.Context, ctrId string, timeout int64)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c crictl) forceKillContainer(ctx context.Context, ctrId string) error {
|
||||
cs, err := c.containerStatus(ctx, ctrId, true)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "get containerStatus")
|
||||
}
|
||||
info := cs.GetInfo()
|
||||
pid, ok := info["pid"]
|
||||
if !ok {
|
||||
return errors.Errorf("not found pid from info %s", jsonutils.Marshal(info))
|
||||
}
|
||||
// get ppid
|
||||
pStatusFile := filepath.Join("/proc", pid, "task", pid, "status")
|
||||
out, err := procutils.NewRemoteCommandAsFarAsPossible("sh", "-c", fmt.Sprintf("cat %s | grep PPid: | awk '{print $2}'", pStatusFile)).Output()
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "get ppid from %s, out: %s", pStatusFile, out)
|
||||
}
|
||||
ppidStr := strings.TrimSpace(string(out))
|
||||
ppid, err := strconv.Atoi(ppidStr)
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "invalid ppid str %s from %s", ppidStr, pStatusFile)
|
||||
}
|
||||
ppCmdlineFile := filepath.Join("/proc", ppidStr, "cmdline")
|
||||
ppCmdline, err := procutils.NewRemoteCommandAsFarAsPossible("cat", ppCmdlineFile).Output()
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "get cmdline from %s, out: %s", ppCmdlineFile, ppCmdline)
|
||||
}
|
||||
log.Infof("try to kill container %s, pid %s parent process(%d): %s", ctrId, pid, ppid, ppCmdline)
|
||||
killOut, err := procutils.NewRemoteCommandAsFarAsPossible("kill", "-9", ppidStr).Output()
|
||||
if err != nil {
|
||||
killErr := errors.Wrapf(err, "kill -9 %s, out: %s", ppidStr, killOut)
|
||||
log.Errorf("kill container %s, pid %s parent process(%d): %s, error: %v", ctrId, pid, ppid, ppCmdline, killErr)
|
||||
return killErr
|
||||
}
|
||||
if err := c.stopContainerWithRetry(ctx, ctrId, 0, 5); err != nil {
|
||||
return errors.Wrapf(err, "stop container %s after kill parent process", ctrId)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c crictl) RemoveContainer(ctx context.Context, ctrId string) error {
|
||||
_, err := c.GetRuntimeClient().RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{
|
||||
ContainerId: ctrId,
|
||||
@@ -383,9 +447,13 @@ func (c crictl) RemoveContainer(ctx context.Context, ctrId string) error {
|
||||
}
|
||||
|
||||
func (c crictl) ContainerStatus(ctx context.Context, ctrId string) (*runtimeapi.ContainerStatusResponse, error) {
|
||||
return c.containerStatus(ctx, ctrId, false)
|
||||
}
|
||||
|
||||
func (c crictl) containerStatus(ctx context.Context, ctrId string, verbose bool) (*runtimeapi.ContainerStatusResponse, error) {
|
||||
req := &runtimeapi.ContainerStatusRequest{
|
||||
ContainerId: ctrId,
|
||||
Verbose: false,
|
||||
Verbose: verbose,
|
||||
}
|
||||
return c.GetRuntimeClient().ContainerStatus(ctx, req)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user