@@ -261,6 +261,7 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
261261 self .running .insert (0 , preempted_req )
262262 continue
263263 preempted_req .status = RequestStatus .PREEMPTED
264+ preempted_req .last_preempted_blocksize = len (preempted_req .block_tables )
264265 preempted_req .num_computed_tokens = 0
265266 if self .config .scheduler_config .splitwise_role == "decode" :
266267 self .tasks_list [preempted_req .idx ] = None
@@ -735,6 +736,14 @@ def _allocate_decode_and_extend():
735736 break
736737 num_new_tokens = self ._get_num_new_tokens (request , token_budget )
737738 num_new_block = self .get_new_block_nums (request , num_new_tokens )
739+ # Never allocate fewer blocks than the request held when it was last preempted (last_preempted_blocksize
740+ # is a block count, recorded as len(block_tables) at preemption time). For normal requests, block allocation
741+ # reserves two extra blocks for decoding. When a preempted request is rescheduled, only the tokens already
742+ # generated are counted, which can yield fewer blocks than the previous allocation and cause repeated rescheduling.
743+ # Raising num_new_block to at least the previous block count avoids that.
744+ last_preempted_blocksize = getattr (request , "last_preempted_blocksize" , 0 )
745+ if num_new_block < last_preempted_blocksize :
746+ num_new_block = last_preempted_blocksize
738747 # Allocate blocks to prefill
739748 if self .cache_manager .can_allocate_gpu_blocks (num_new_block ):
740749 if not request .get ("skip_allocate" , False ):
0 commit comments