Skip to content

Commit bbb5a58

Browse files
committed
fix: Add endpoint_name fallback for orphaned K8s resource cleanup
When deleting an endpoint whose DB record no longer exists but whose K8s resources still do (orphaned resources), cleanup now falls back to a lookup by endpoint_name when the endpoint_id lookup fails. This handles cases where the value passed as endpoint_id is actually an endpoint_name (e.g., mcpx-endpoint-{uuid}).

Changes:
- Add _get_deployment_by_endpoint_name_label method to query K8s by the endpoint_name label
- Update _cleanup_orphaned_k8s_resources to accept an optional endpoint_name parameter
- Add fallback logic in DeleteModelEndpointByIdV1UseCase to try endpoint_name when model_endpoint_id looks like an endpoint_name
1 parent 841b6c1 commit bbb5a58

File tree

3 files changed

+63
-2
lines changed

3 files changed

+63
-2
lines changed

model-engine/model_engine_server/domain/use_cases/model_endpoint_use_cases.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,8 +609,17 @@ async def execute(self, user: User, model_endpoint_id: str) -> DeleteModelEndpoi
609609
)
610610

611611
if isinstance(self.model_endpoint_service, LiveModelEndpointService):
612+
# If model_endpoint_id looks like an endpoint_name (e.g., "mcpx-endpoint-{uuid}"),
613+
# try it as endpoint_name fallback
614+
endpoint_name = None
615+
if model_endpoint_id.startswith("mcpx-endpoint-"):
616+
endpoint_name = model_endpoint_id
617+
# Try to extract endpoint_id from it (though we don't have it)
618+
# Will rely on endpoint_name lookup
619+
model_endpoint_id = model_endpoint_id # Keep as-is for logging
620+
612621
owner = await self.model_endpoint_service._cleanup_orphaned_k8s_resources(
613-
model_endpoint_id
622+
model_endpoint_id, endpoint_name=endpoint_name
614623
)
615624
if owner is not None:
616625
# Verify authorization - user must match owner (created_by from K8s labels)

model-engine/model_engine_server/infra/gateways/resources/k8s_endpoint_resource_delegate.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,6 +1191,34 @@ async def _get_deployment_by_endpoint_id_label(endpoint_id: str) -> Optional[V1D
11911191
logger.exception(f"Error querying deployments by endpoint_id label {endpoint_id}")
11921192
raise
11931193

1194+
@staticmethod
async def _get_deployment_by_endpoint_name_label(endpoint_name: str) -> Optional[V1Deployment]:
    """
    Look up a Deployment through the ``endpoint_name`` label selector.

    Intended for orphaned resources: the DB record is gone, but K8s
    resources carrying the label might still exist.

    Args:
        endpoint_name: Label value to match (e.g., "mcpx-endpoint-{deploymentID}")

    Returns:
        The first Deployment in the configured endpoint namespace whose
        endpoint_name label matches, or None when nothing matches or the
        API responds with 404.

    Raises:
        ApiException: Any non-404 API error, re-raised after being logged.
    """
    client = get_kubernetes_apps_client()
    selector = f"endpoint_name={endpoint_name}"
    try:
        listing = await client.list_namespaced_deployment(
            namespace=hmi_config.endpoint_namespace,
            label_selector=selector,
        )
    except ApiException as exc:
        # A 404 is treated as "nothing there"; everything else is a real failure.
        if exc.status != 404:
            logger.exception(f"Error querying deployments by endpoint_name label {endpoint_name}")
            raise
        return None
    matches = listing.items
    return matches[0] if matches else None
1221+
11941222
@staticmethod
11951223
async def _determine_endpoint_type_from_k8s(endpoint_id: str) -> ModelEndpointType:
11961224
"""

model-engine/model_engine_server/infra/services/live_model_endpoint_service.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,13 +410,16 @@ async def delete_model_endpoint(self, model_endpoint_id: str) -> None:
410410

411411
logger.info(f"Endpoint delete released lock for {created_by}, {name}")
412412

413-
async def _cleanup_orphaned_k8s_resources(self, endpoint_id: str) -> Optional[str]:
413+
async def _cleanup_orphaned_k8s_resources(
414+
self, endpoint_id: str, endpoint_name: Optional[str] = None
415+
) -> Optional[str]:
414416
"""
415417
Cleans up orphaned K8s resources when DB record doesn't exist.
416418
Returns the owner (created_by) from K8s labels if resources were found, None otherwise.
417419
418420
Args:
419421
endpoint_id: The endpoint_id to check for orphaned resources
422+
endpoint_name: Optional endpoint_name to try if endpoint_id lookup fails
420423
421424
Returns:
422425
The owner (created_by) from K8s labels if resources found, None otherwise
@@ -425,6 +428,27 @@ async def _cleanup_orphaned_k8s_resources(self, endpoint_id: str) -> Optional[st
425428
deployment = await K8SEndpointResourceDelegate._get_deployment_by_endpoint_id_label(
426429
endpoint_id
427430
)
431+
# Fallback: try endpoint_name if endpoint_id lookup fails
432+
if deployment is None and endpoint_name:
433+
logger.info(
434+
f"endpoint_id {endpoint_id} not found, trying endpoint_name {endpoint_name}"
435+
)
436+
try:
437+
deployment = await K8SEndpointResourceDelegate._get_deployment_by_endpoint_name_label(
438+
endpoint_name
439+
)
440+
# If found by endpoint_name, extract endpoint_id from labels
441+
if deployment:
442+
labels = deployment.metadata.labels or {}
443+
endpoint_id = labels.get("endpoint_id", endpoint_id)
444+
logger.info(
445+
f"Found deployment by endpoint_name {endpoint_name}, extracted endpoint_id {endpoint_id}"
446+
)
447+
else:
448+
logger.info(f"No deployment found by endpoint_name {endpoint_name}")
449+
except Exception as e:
450+
logger.exception(f"Error looking up deployment by endpoint_name {endpoint_name}: {e}")
451+
raise
428452
if deployment is None:
429453
return None
430454

0 commit comments

Comments
 (0)