[BugFix] reschedule_preempt_task append waiting & PREEMPTED blocksize (#5506)

ST-XX · Copilot · web-flow · commit dbedb0797b24 · 2025-12-12T17:43:29.000+08:00
* bugfix reschedule_preempt_task append waiting & PREEMPTED blocksize

* bugfix reschedule_preempt_task append waiting & PREEMPTED blocksize

* Add comments (注释)

* [bugfix] PREEMPTED task blocksize

* Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -261,6 +261,7 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
                     self.running.insert(0, preempted_req)
                     continue
                 preempted_req.status = RequestStatus.PREEMPTED
+                preempted_req.last_preempted_blocksize = len(preempted_req.block_tables)
                 preempted_req.num_computed_tokens = 0
                 if self.config.scheduler_config.splitwise_role == "decode":
                     self.tasks_list[preempted_req.idx] = None
@@ -735,6 +736,14 @@ def _allocate_decode_and_extend():
                                 break
                         num_new_tokens = self._get_num_new_tokens(request, token_budget)
                         num_new_block = self.get_new_block_nums(request, num_new_tokens)
+                        # If num_new_block is less than the last preempted block size, use the last preempted block size instead.
+                        # For normal requests, when allocating blocks, we reserve two extra blocks for decoding.
+                        # In the request rescheduling scenario, we currently only consider the number of tokens already generated,
+                        # which might lead to allocating fewer blocks than the previous allocation, causing repeated rescheduling.
+                        # This adjustment ensures we at least allocate as many blocks as before to avoid this issue.
+                        last_preempted_blocksize = getattr(request, "last_preempted_blocksize", 0)
+                        if num_new_block < last_preempted_blocksize:
+                            num_new_block = last_preempted_blocksize
                         # Allocate blocks to prefill
                         if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
                             if not request.get("skip_allocate", False):