fix: GPU 采集器 nvidia-smi 调用添加 15 秒超时

nvidia-smi 在 GPU 驱动异常时可能无限挂起,导致整个 metricsLoop 卡死。
使用 exec.CommandContext 为 initStatic 和 collectDynamic 中的 nvidia-smi
调用添加 15 秒超时保护。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
yumusb
2026-04-20 14:43:16 +08:00
parent e0c9fc5b25
commit c2e64804f3

View File

@@ -1,15 +1,19 @@
package collector
import (
"context"
"encoding/csv"
"os/exec"
"strconv"
"strings"
"sync"
"time"
"github.com/dushixiang/pika/internal/protocol"
)
// nvidiaSmiTimeout is the deadline applied, via context.WithTimeout, to each
// nvidia-smi invocation made by this collector, so that a hung nvidia-smi
// process cannot block the caller indefinitely.
const nvidiaSmiTimeout = 15 * time.Second
// gpuStaticInfo GPU 静态信息
type gpuStaticInfo struct {
Index int
@@ -42,7 +46,9 @@ func (g *GPUCollector) initStatic() {
}
// 查询静态信息: index, name, uuid, memory.total
cmd := exec.Command("nvidia-smi",
ctx, cancel := context.WithTimeout(context.Background(), nvidiaSmiTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "nvidia-smi",
"--query-gpu=index,name,uuid,memory.total",
"--format=csv,noheader,nounits")
@@ -105,7 +111,9 @@ func (g *GPUCollector) Collect() ([]*protocol.GPUData, error) {
func (g *GPUCollector) collectDynamic() ([]*protocol.GPUData, error) {
// 使用 nvidia-smi 查询动态数据
// 输出格式: index, temperature.gpu, utilization.gpu, memory.used, memory.free, power.draw, fan.speed
cmd := exec.Command("nvidia-smi",
ctx, cancel := context.WithTimeout(context.Background(), nvidiaSmiTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "nvidia-smi",
"--query-gpu=index,temperature.gpu,utilization.gpu,memory.used,memory.free,power.draw,fan.speed",
"--format=csv,noheader,nounits")