Skip to content

Commit f8caf24

Browse files
committed
fix: update incorrect metrics in cache
Signed-off-by: CYJiang <[email protected]>
1 parent f12a6a7 commit f8caf24

File tree

5 files changed

+174
-103
lines changed

5 files changed

+174
-103
lines changed

pkg/cache/cache_metrics.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -353,17 +353,21 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
353353
klog.V(4).Infof(err.Error())
354354
engineType = defaultEngineLabelValue
355355
}
356-
rawMetricName, ok := metric.EngineMetricsNameMapping[engineType]
357-
if !ok {
358-
klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
356+
candidates, ok := metric.EngineMetricsNameMapping[engineType]
357+
if !ok || len(candidates) == 0 {
358+
klog.V(4).Infof("No metric name candidates found for engine type %v and metric %v", engineType, labelMetricName)
359359
return nil, false
360360
}
361-
metricFamily, exists := allMetrics[rawMetricName]
362-
if !exists {
363-
klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType)
364-
return nil, false
361+
362+
for _, rawMetricName := range candidates {
363+
if metricFamily, exists := allMetrics[rawMetricName]; exists {
364+
return metricFamily, true
365+
}
365366
}
366-
return metricFamily, true
367+
368+
klog.V(4).Infof("None of the candidate raw metrics %v found for engine %v and metric %v", candidates, engineType, labelMetricName)
369+
return nil, false
370+
367371
}
368372

369373
// Update `PodMetrics` and `PodModelMetrics` according to the metric scope

pkg/metrics/engine_fetcher.go

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,14 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint,
100100
return nil, fmt.Errorf("metric %s is not a raw pod metric, use FetchAllTypedMetrics for complex queries", metricName)
101101
}
102102

103-
// Get raw metric name for this engine
104-
rawMetricName, exists := metricDef.EngineMetricsNameMapping[engineType]
105-
if !exists {
103+
// Get raw metric name candidates for this engine
104+
candidates, exists := metricDef.EngineMetricsNameMapping[engineType]
105+
if !exists || len(candidates) == 0 {
106106
return nil, fmt.Errorf("metric %s not supported for engine type %s", metricName, engineType)
107107
}
108108

109109
url := fmt.Sprintf("http://%s/metrics", endpoint)
110-
110+
var lastErr error
111111
// Fetch with retry logic
112112
for attempt := 0; attempt <= ef.config.MaxRetries; attempt++ {
113113
if attempt > 0 {
@@ -130,21 +130,37 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint,
130130
continue
131131
}
132132

133-
// Parse the specific metric we need
134-
metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef)
135-
if err != nil {
136-
klog.V(4).InfoS("Failed to parse metric from engine endpoint",
137-
"attempt", attempt+1, "identifier", identifier, "metric", metricName, "error", err)
138-
continue
133+
// Try each candidate until one exists and can be parsed
134+
for _, rawMetricName := range candidates {
135+
if _, ok := allMetrics[rawMetricName]; !ok {
136+
continue // skip if not present
137+
}
138+
139+
metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef)
140+
if err != nil {
141+
lastErr = err
142+
klog.V(5).InfoS("Failed to parse candidate metric", "candidate", rawMetricName, "error", err)
143+
continue
144+
}
145+
146+
klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint",
147+
"identifier", identifier, "metric", metricName, "rawMetric", rawMetricName, "value", metricValue, "attempt", attempt+1)
148+
return metricValue, nil
139149
}
140150

141-
klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint",
142-
"identifier", identifier, "metric", metricName, "value", metricValue, "attempt", attempt+1)
143-
return metricValue, nil
151+
klog.V(4).InfoS("Failed to find valid metric among candidates",
152+
"candidates", candidates, "identifier", identifier, "metric", metricName)
153+
// Continue to next retry if any
144154
}
145155

146-
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts",
147-
metricName, identifier, ef.config.MaxRetries+1)
156+
// If we get here, none of the candidates worked
157+
errMsg := "none of the candidate metric names found or parsed successfully"
158+
if lastErr != nil {
159+
errMsg += ": " + lastErr.Error()
160+
}
161+
162+
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w",
163+
metricName, identifier, ef.config.MaxRetries+1, lastErr)
148164
}
149165

150166
// FetchAllTypedMetrics fetches all available typed metrics from an engine endpoint
@@ -215,10 +231,26 @@ func (ef *EngineMetricsFetcher) FetchAllTypedMetrics(ctx context.Context, endpoi
215231
continue
216232
}
217233

218-
// Get raw metric name for this engine
219-
rawMetricName, exists := metricDef.EngineMetricsNameMapping[result.EngineType]
220-
if !exists {
221-
klog.V(5).InfoS("Metric not supported for engine type", "metric", metricName, "engine", result.EngineType)
234+
// Get raw metric name candidates for this engine
235+
candidates, exists := metricDef.EngineMetricsNameMapping[result.EngineType]
236+
if !exists || len(candidates) == 0 {
237+
klog.V(5).InfoS("No raw metric names defined for metric and engine type",
238+
"metric", metricName, "engine", result.EngineType)
239+
continue
240+
}
241+
242+
// Find the first candidate that exists in allMetrics
243+
var rawMetricName string
244+
for _, name := range candidates {
245+
if _, ok := allMetrics[name]; ok {
246+
rawMetricName = name
247+
break
248+
}
249+
}
250+
251+
if rawMetricName == "" {
252+
klog.V(5).InfoS("None of the candidate raw metrics found in endpoint response",
253+
"metric", metricName, "engine", result.EngineType, "candidates", candidates)
222254
continue
223255
}
224256

pkg/metrics/engine_fetcher_test.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ func setupMockMetrics() {
7777
Metrics["running_requests"] = Metric{
7878
MetricSource: PodRawMetrics,
7979
MetricType: MetricType{Raw: Gauge},
80-
EngineMetricsNameMapping: map[string]string{
81-
"vllm": "vllm_num_requests_running",
82-
"sglang": "sglang_running_requests",
80+
EngineMetricsNameMapping: map[string][]string{
81+
"vllm": {"vllm_num_requests_running"},
82+
"sglang": {"sglang_running_requests"},
8383
},
8484
Description: "Number of running requests",
8585
MetricScope: PodModelMetricScope,
@@ -88,9 +88,9 @@ func setupMockMetrics() {
8888
Metrics["waiting_requests"] = Metric{
8989
MetricSource: PodRawMetrics,
9090
MetricType: MetricType{Raw: Gauge},
91-
EngineMetricsNameMapping: map[string]string{
92-
"vllm": "vllm_num_requests_waiting",
93-
"sglang": "sglang_waiting_requests",
91+
EngineMetricsNameMapping: map[string][]string{
92+
"vllm": {"vllm_num_requests_waiting"},
93+
"sglang": {"sglang_waiting_requests"},
9494
},
9595
Description: "Number of waiting requests",
9696
MetricScope: PodModelMetricScope,
@@ -99,9 +99,9 @@ func setupMockMetrics() {
9999
Metrics["cache_usage"] = Metric{
100100
MetricSource: PodRawMetrics,
101101
MetricType: MetricType{Raw: Gauge},
102-
EngineMetricsNameMapping: map[string]string{
103-
"vllm": "vllm_gpu_cache_usage_perc",
104-
"sglang": "sglang_cache_usage",
102+
EngineMetricsNameMapping: map[string][]string{
103+
"vllm": {"vllm_gpu_cache_usage_perc"},
104+
"sglang": {"sglang_cache_usage"},
105105
},
106106
Description: "Cache usage percentage",
107107
MetricScope: PodMetricScope,
@@ -110,8 +110,8 @@ func setupMockMetrics() {
110110
Metrics["time_to_first_token"] = Metric{
111111
MetricSource: PodRawMetrics,
112112
MetricType: MetricType{Raw: Histogram},
113-
EngineMetricsNameMapping: map[string]string{
114-
"vllm": "vllm_time_to_first_token_seconds",
113+
EngineMetricsNameMapping: map[string][]string{
114+
"vllm": {"vllm_time_to_first_token_seconds"},
115115
},
116116
Description: "Time to first token histogram",
117117
MetricScope: PodModelMetricScope,

0 commit comments

Comments
 (0)