mirror of
https://github.com/dushixiang/pika.git
synced 2026-05-07 06:16:43 +08:00
fix: GPU 采集器 nvidia-smi 调用添加 15 秒超时
nvidia-smi 在 GPU 驱动异常时可能无限挂起,导致整个 metricsLoop 卡死。使用 exec.CommandContext 为 initStatic 和 collectDynamic 中的 nvidia-smi 调用添加 15 秒超时保护。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,15 +1,19 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/dushixiang/pika/internal/protocol"
|
||||
)
|
||||
|
||||
const nvidiaSmiTimeout = 15 * time.Second
|
||||
|
||||
// gpuStaticInfo GPU 静态信息
|
||||
type gpuStaticInfo struct {
|
||||
Index int
|
||||
@@ -42,7 +46,9 @@ func (g *GPUCollector) initStatic() {
|
||||
}
|
||||
|
||||
// 查询静态信息: index, name, uuid, memory.total
|
||||
- cmd := exec.Command("nvidia-smi",
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), nvidiaSmiTimeout)
|
||||
+ defer cancel()
|
||||
+ cmd := exec.CommandContext(ctx, "nvidia-smi",
|
||||
"--query-gpu=index,name,uuid,memory.total",
|
||||
"--format=csv,noheader,nounits")
|
||||
|
||||
@@ -105,7 +111,9 @@ func (g *GPUCollector) Collect() ([]*protocol.GPUData, error) {
|
||||
func (g *GPUCollector) collectDynamic() ([]*protocol.GPUData, error) {
|
||||
// 使用 nvidia-smi 查询动态数据
|
||||
// 输出格式: index, temperature.gpu, utilization.gpu, memory.used, memory.free, power.draw, fan.speed
|
||||
- cmd := exec.Command("nvidia-smi",
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), nvidiaSmiTimeout)
|
||||
+ defer cancel()
|
||||
+ cmd := exec.CommandContext(ctx, "nvidia-smi",
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,memory.used,memory.free,power.draw,fan.speed",
|
||||
"--format=csv,noheader,nounits")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user