From 0909bb0d2f87e3d6a73a8e0dc0e38f55ce44a4d4 Mon Sep 17 00:00:00 2001
From: Ying Sheng <sqy1415@gmail.com>
Date: Tue, 13 Aug 2024 17:01:26 -0700
Subject: [PATCH] [Feat] Add window attention for gemma-2 (#1056)

---
 python/sglang/bench_latency.py                |   2 +-
 python/sglang/srt/layers/radix_attention.py   |  59 +++--
 .../srt/model_executor/forward_batch_info.py  | 203 +++++++++++++-----
 .../sglang/srt/model_executor/model_runner.py |  86 ++++++--
 python/sglang/srt/models/gemma2.py            |  16 +-
 python/sglang/srt/server_args.py              |  12 ++
 python/sglang/test/long_prompt                |   1 +
 python/sglang/test/runners.py                 |  26 ++-
 scripts/playground/reference_hf.py            |   8 +-
 test/srt/models/test_embedding_models.py      |  10 +-
 test/srt/models/test_generation_models.py     |  22 +-
 11 files changed, 319 insertions(+), 126 deletions(-)
 create mode 100644 python/sglang/test/long_prompt

diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py
index c2b956e1dad..ee227849cf8 100644
--- a/python/sglang/bench_latency.py
+++ b/python/sglang/bench_latency.py
@@ -64,7 +64,7 @@ class BenchArgs:
     run_name: str = "before"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
-    output_len: Tuple[int] = (4,)
+    output_len: Tuple[int] = (16,)
     result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py
index 1568cf6d96a..49b86ad191a 100644
--- a/python/sglang/srt/layers/radix_attention.py
+++ b/python/sglang/srt/layers/radix_attention.py
@@ -34,6 +34,7 @@ def __init__(
         scaling: float,
         num_kv_heads: int,
         layer_id: int,
+        sliding_window_size: int = -1,
         logit_cap: int = -1,
         v_head_dim: int = -1,
     ):
@@ -46,6 +47,7 @@ def __init__(
         self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim
         self.scaling = scaling
         self.layer_id = layer_id
+        self.sliding_window_size = sliding_window_size
 
         if (
             not global_server_args_dict.get("disable_flashinfer", False)
@@ -113,39 +115,51 @@ def decode_forward_triton(self, q, k, v, input_metadata: InputMetadata):
         return o
 
     def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
+        # using two wrappers is unnecessary in the current PR, but are prepared for future PRs
+        prefill_wrapper_ragged = input_metadata.flashinfer_prefill_wrapper_ragged
+        prefill_wrapper_paged = input_metadata.flashinfer_prefill_wrapper_paged
+        if self.sliding_window_size != -1:
+            prefill_wrapper_ragged = prefill_wrapper_ragged[0]
+            prefill_wrapper_paged = prefill_wrapper_paged[0]
+        else:
+            if isinstance(prefill_wrapper_ragged, list):
+                prefill_wrapper_ragged = prefill_wrapper_ragged[1]
+            if isinstance(prefill_wrapper_paged, list):
+                prefill_wrapper_paged = prefill_wrapper_paged[1]
+
         if not input_metadata.flashinfer_use_ragged:
             self.store_kv_cache(k, v, input_metadata)
 
-            o = input_metadata.flashinfer_prefill_wrapper_paged.forward(
+            o = prefill_wrapper_paged.forward(
                 q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
                 input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
                 causal=True,
                 sm_scale=self.scaling,
+                window_left=self.sliding_window_size,
                 logits_soft_cap=self.logit_cap,
             )
         else:
-            o1, s1 = (
-                input_metadata.flashinfer_prefill_wrapper_ragged.forward_return_lse(
-                    q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
-                    k.contiguous().view(-1, self.tp_k_head_num, self.head_dim),
-                    v.contiguous().view(-1, self.tp_v_head_num, self.head_dim),
-                    causal=True,
-                    sm_scale=self.scaling,
-                    logits_soft_cap=self.logit_cap,
-                )
+            o1, s1 = prefill_wrapper_ragged.forward_return_lse(
+                q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
+                k.contiguous().view(-1, self.tp_k_head_num, self.head_dim),
+                v.contiguous().view(-1, self.tp_v_head_num, self.head_dim),
+                causal=True,
+                sm_scale=self.scaling,
+                window_left=self.sliding_window_size,
+                logits_soft_cap=self.logit_cap,
             )
 
             if input_metadata.extend_no_prefix:
                 o = o1
             else:
-                o2, s2 = (
-                    input_metadata.flashinfer_prefill_wrapper_paged.forward_return_lse(
-                        q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
-                        input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
-                        causal=False,
-                        sm_scale=self.scaling,
-                        logits_soft_cap=self.logit_cap,
-                    )
+                # TODO window attention + radix attention will come up in next PR
+                assert self.sliding_window_size == -1
+                o2, s2 = prefill_wrapper_paged.forward_return_lse(
+                    q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
+                    input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
+                    causal=False,
+                    sm_scale=self.scaling,
+                    logits_soft_cap=self.logit_cap,
                 )
 
                 o, _ = merge_state(o1, s1, o2, s2)
@@ -158,9 +172,16 @@ def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
         return o.view(-1, self.tp_q_head_num * self.head_dim)
 
     def decode_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
+        decode_wrapper = input_metadata.flashinfer_decode_wrapper
+        if self.sliding_window_size != -1:
+            decode_wrapper = decode_wrapper[0]
+        else:
+            if isinstance(decode_wrapper, list):
+                decode_wrapper = decode_wrapper[1]
+
         self.store_kv_cache(k, v, input_metadata)
 
-        o = input_metadata.flashinfer_decode_wrapper.forward(
+        o = decode_wrapper.forward(
             q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
             input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
             sm_scale=self.scaling,
diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py
index eb7aaaf2c10..3b2ee9de062 100644
--- a/python/sglang/srt/model_executor/forward_batch_info.py
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -16,7 +16,7 @@
 """ModelRunner runs the forward passes of the models."""
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import torch
@@ -154,6 +154,7 @@ def from_schedule_batch(
         model_runner: "ModelRunner",
         batch: ScheduleBatch,
         forward_mode: ForwardMode,
+        sliding_window_size: Optional[int] = None,
     ):
         ret = cls(
             forward_mode=forward_mode,
@@ -197,7 +198,7 @@ def from_schedule_batch(
             ):
                 flashinfer_use_ragged = True
             ret.init_flashinfer_handlers(
-                model_runner, prefix_lens, flashinfer_use_ragged
+                model_runner, prefix_lens, flashinfer_use_ragged, sliding_window_size
             )
 
         return ret
@@ -216,7 +217,11 @@ def init_triton_args(self, batch: ScheduleBatch, prefix_lens):
             self.triton_max_extend_len = int(torch.max(extend_seq_lens))
 
     def init_flashinfer_handlers(
-        self, model_runner, prefix_lens, flashinfer_use_ragged
+        self,
+        model_runner,
+        prefix_lens,
+        flashinfer_use_ragged,
+        sliding_window_size=None,
     ):
         update_flashinfer_indices(
             self.forward_mode,
@@ -225,6 +230,7 @@ def init_flashinfer_handlers(
             self.seq_lens,
             prefix_lens,
             flashinfer_use_ragged=flashinfer_use_ragged,
+            sliding_window_size=sliding_window_size,
         )
 
         (
@@ -248,6 +254,7 @@ def update_flashinfer_indices(
     prefix_lens,
     flashinfer_decode_wrapper=None,
     flashinfer_use_ragged=False,
+    sliding_window_size=None,
 ):
     """Init auxiliary variables for FlashInfer attention backend."""
     num_qo_heads = model_runner.model_config.num_attention_heads // model_runner.tp_size
@@ -255,65 +262,145 @@ def update_flashinfer_indices(
     head_dim = model_runner.model_config.head_dim
     batch_size = len(req_pool_indices)
 
-    if flashinfer_use_ragged:
-        paged_kernel_lens = prefix_lens
-    else:
-        paged_kernel_lens = seq_lens
-
-    kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
-    kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
-    req_pool_indices_cpu = req_pool_indices.cpu().numpy()
-    paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy()
-    kv_indices = torch.cat(
-        [
-            model_runner.req_to_token_pool.req_to_token[
-                req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i]
-            ]
-            for i in range(batch_size)
-        ],
-        dim=0,
-    ).contiguous()
-    kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
-
-    if forward_mode == ForwardMode.DECODE:
-        # CUDA graph uses different flashinfer_decode_wrapper
-        if flashinfer_decode_wrapper is None:
-            flashinfer_decode_wrapper = model_runner.flashinfer_decode_wrapper
-
-        flashinfer_decode_wrapper.end_forward()
-        flashinfer_decode_wrapper.begin_forward(
-            kv_indptr,
-            kv_indices,
-            kv_last_page_len,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            1,
-        )
-    else:
-        # extend part
-        qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
-        qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)
-
+    if sliding_window_size is None:
         if flashinfer_use_ragged:
-            model_runner.flashinfer_prefill_wrapper_ragged.end_forward()
-            model_runner.flashinfer_prefill_wrapper_ragged.begin_forward(
-                qo_indptr,
+            paged_kernel_lens = prefix_lens
+        else:
+            paged_kernel_lens = seq_lens
+
+        kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
+        req_pool_indices_cpu = req_pool_indices.cpu().numpy()
+        paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy()
+        kv_indices = torch.cat(
+            [
+                model_runner.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i]
+                ]
+                for i in range(batch_size)
+            ],
+            dim=0,
+        ).contiguous()
+        kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
+
+        if forward_mode == ForwardMode.DECODE:
+            # CUDA graph uses different flashinfer_decode_wrapper
+            if flashinfer_decode_wrapper is None:
+                flashinfer_decode_wrapper = model_runner.flashinfer_decode_wrapper
+
+            flashinfer_decode_wrapper.end_forward()
+            flashinfer_decode_wrapper.begin_forward(
+                kv_indptr,
+                kv_indices,
+                kv_last_page_len,
+                num_qo_heads,
+                num_kv_heads,
+                head_dim,
+                1,
+            )
+        else:
+            # extend part
+            qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
+            qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+
+            if flashinfer_use_ragged:
+                model_runner.flashinfer_prefill_wrapper_ragged.end_forward()
+                model_runner.flashinfer_prefill_wrapper_ragged.begin_forward(
+                    qo_indptr,
+                    qo_indptr,
+                    num_qo_heads,
+                    num_kv_heads,
+                    head_dim,
+                )
+
+            # cached part
+            model_runner.flashinfer_prefill_wrapper_paged.end_forward()
+            model_runner.flashinfer_prefill_wrapper_paged.begin_forward(
                 qo_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_last_page_len,
                 num_qo_heads,
                 num_kv_heads,
                 head_dim,
+                1,
             )
+    else:
+        kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
+        for wrapper_id in range(2):
+            if flashinfer_use_ragged:
+                paged_kernel_lens = prefix_lens
+            else:
+                paged_kernel_lens = seq_lens
 
-        # cached part
-        model_runner.flashinfer_prefill_wrapper_paged.end_forward()
-        model_runner.flashinfer_prefill_wrapper_paged.begin_forward(
-            qo_indptr,
-            kv_indptr,
-            kv_indices,
-            kv_last_page_len,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            1,
-        )
+            if wrapper_id == 0 and forward_mode == ForwardMode.DECODE:
+                paged_kernel_lens = torch.minimum(
+                    paged_kernel_lens, torch.tensor(sliding_window_size)
+                )
+                kv_start_idx = seq_lens - paged_kernel_lens
+            else:
+                kv_start_idx = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+
+            kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
+            kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
+            req_pool_indices_cpu = req_pool_indices.cpu().numpy()
+            paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy()
+            kv_indices = torch.cat(
+                [
+                    model_runner.req_to_token_pool.req_to_token[
+                        req_pool_indices_cpu[i],
+                        kv_start_idx[i] : kv_start_idx[i] + paged_kernel_lens_cpu[i],
+                    ]
+                    for i in range(batch_size)
+                ],
+                dim=0,
+            ).contiguous()
+
+            if forward_mode == ForwardMode.DECODE:
+                # CUDA graph uses different flashinfer_decode_wrapper
+                if flashinfer_decode_wrapper is None:
+                    flashinfer_decode_wrapper = model_runner.flashinfer_decode_wrapper
+
+                flashinfer_decode_wrapper[wrapper_id].end_forward()
+                flashinfer_decode_wrapper[wrapper_id].begin_forward(
+                    kv_indptr,
+                    kv_indices,
+                    kv_last_page_len,
+                    num_qo_heads,
+                    num_kv_heads,
+                    head_dim,
+                    1,
+                )
+            else:
+                # extend part
+                qo_indptr = torch.zeros(
+                    (batch_size + 1,), dtype=torch.int32, device="cuda"
+                )
+                qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+
+                if flashinfer_use_ragged:
+                    model_runner.flashinfer_prefill_wrapper_ragged[
+                        wrapper_id
+                    ].end_forward()
+                    model_runner.flashinfer_prefill_wrapper_ragged[
+                        wrapper_id
+                    ].begin_forward(
+                        qo_indptr,
+                        qo_indptr,
+                        num_qo_heads,
+                        num_kv_heads,
+                        head_dim,
+                    )
+
+                # cached part
+                model_runner.flashinfer_prefill_wrapper_paged[wrapper_id].end_forward()
+                model_runner.flashinfer_prefill_wrapper_paged[wrapper_id].begin_forward(
+                    qo_indptr,
+                    kv_indptr,
+                    kv_indices,
+                    kv_last_page_len,
+                    num_qo_heads,
+                    num_kv_heads,
+                    head_dim,
+                    1,
+                )
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 574ad365800..34a40c7d71a 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -295,7 +295,16 @@ def init_cublas(self):
         return c
 
     def init_flashinfer(self):
+        self.sliding_window_size = (
+            self.model.get_window_size()
+            if hasattr(self.model, "get_window_size")
+            else None
+        )
+
         if self.server_args.disable_flashinfer:
+            assert (
+                self.sliding_window_size is None
+            ), "turn on flashinfer to support window attention"
             self.flashinfer_prefill_wrapper_ragged = None
             self.flashinfer_prefill_wrapper_paged = None
             self.flashinfer_decode_wrapper = None
@@ -309,20 +318,54 @@ def init_flashinfer(self):
         else:
             use_tensor_cores = False
 
-        self.flashinfer_workspace_buffers = torch.empty(
-            2, global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda"
-        )
-        self.flashinfer_prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
-            self.flashinfer_workspace_buffers[0], "NHD"
-        )
-        self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper(
-            self.flashinfer_workspace_buffers[1], "NHD"
-        )
-        self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
-            self.flashinfer_workspace_buffers[0],
-            "NHD",
-            use_tensor_cores=use_tensor_cores,
-        )
+        if self.sliding_window_size is None:
+            self.flashinfer_workspace_buffers = torch.empty(
+                2,
+                global_config.flashinfer_workspace_size,
+                dtype=torch.uint8,
+                device="cuda",
+            )
+            self.flashinfer_prefill_wrapper_ragged = (
+                BatchPrefillWithRaggedKVCacheWrapper(
+                    self.flashinfer_workspace_buffers[0], "NHD"
+                )
+            )
+            self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper(
+                self.flashinfer_workspace_buffers[1], "NHD"
+            )
+            self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+                self.flashinfer_workspace_buffers[0],
+                "NHD",
+                use_tensor_cores=use_tensor_cores,
+            )
+        else:
+            workspace_buffers = torch.empty(
+                4,
+                global_config.flashinfer_workspace_size,
+                dtype=torch.uint8,
+                device="cuda",
+            )
+            self.flashinfer_prefill_wrapper_ragged = []
+            self.flashinfer_prefill_wrapper_paged = []
+            self.flashinfer_decode_wrapper = []
+            for i in range(2):
+                self.flashinfer_prefill_wrapper_ragged.append(
+                    BatchPrefillWithRaggedKVCacheWrapper(
+                        workspace_buffers[2 * i + 0], "NHD"
+                    )
+                )
+                self.flashinfer_prefill_wrapper_paged.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        workspace_buffers[2 * i + 1], "NHD"
+                    )
+                )
+                self.flashinfer_decode_wrapper.append(
+                    BatchDecodeWithPagedKVCacheWrapper(
+                        workspace_buffers[2 * i + 0],
+                        "NHD",
+                        use_tensor_cores=use_tensor_cores,
+                    )
+                )
 
     def init_cuda_graphs(self):
         from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
@@ -358,7 +401,10 @@ def forward_decode(self, batch: ScheduleBatch):
             return self.cuda_graph_runner.replay(batch)
 
         input_metadata = InputMetadata.from_schedule_batch(
-            self, batch, ForwardMode.DECODE
+            self,
+            batch,
+            ForwardMode.DECODE,
+            sliding_window_size=self.sliding_window_size,
         )
 
         return self.model.forward(
@@ -368,7 +414,10 @@ def forward_decode(self, batch: ScheduleBatch):
     @torch.inference_mode()
     def forward_extend(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(
-            self, batch, forward_mode=ForwardMode.EXTEND
+            self,
+            batch,
+            forward_mode=ForwardMode.EXTEND,
+            sliding_window_size=self.sliding_window_size,
         )
         return self.model.forward(
             batch.input_ids, input_metadata.positions, input_metadata
@@ -377,7 +426,10 @@ def forward_extend(self, batch: ScheduleBatch):
     @torch.inference_mode()
     def forward_extend_multi_modal(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(
-            self, batch, forward_mode=ForwardMode.EXTEND
+            self,
+            batch,
+            forward_mode=ForwardMode.EXTEND,
+            sliding_window_size=self.sliding_window_size,
         )
         return self.model.forward(
             batch.input_ids,
diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py
index db87624d2df..463d5e50542 100644
--- a/python/sglang/srt/models/gemma2.py
+++ b/python/sglang/srt/models/gemma2.py
@@ -44,6 +44,12 @@
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_window_size(config):
+    return config.sliding_window - 1
+
+
 class GemmaRMSNorm(CustomOp):
     """RMS normalization for Gemma.
 
@@ -200,17 +206,14 @@ def __init__(
             dtype=torch.get_default_dtype(),
         )
 
-        # from vLLM: FIXME(woosuk): While Gemma 2 uses sliding window attention for every
-        # odd layer, vLLM currently ignores it and uses global attention for
-        # all layers.
-        use_sliding_window = layer_idx % 2 == 1 and config.sliding_window is not None
-        del use_sliding_window  # Unused.
+        use_sliding_window = layer_idx % 2 == 0 and hasattr(config, "sliding_window")
         self.attn = RadixAttention(
             self.num_heads,
             self.head_dim,
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_idx,
+            sliding_window_size=get_window_size(config) if use_sliding_window else -1,
             logit_cap=self.config.attn_logit_softcapping,
         )
 
@@ -403,6 +406,9 @@ def forward(
             input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
         )
 
+    def get_window_size(self):
+        return get_window_size(self.config)
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 474c80b256f..5e7996b8014 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -17,9 +17,12 @@
 
 import argparse
 import dataclasses
+import logging
 import random
 from typing import List, Optional, Union
 
+logger = logging.getLogger(__name__)
+
 
 @dataclasses.dataclass
 class ServerArgs:
@@ -446,6 +449,15 @@ def check_server_args(self):
         assert not (
             self.dp_size > 1 and self.node_rank is not None
         ), "multi-node data parallel is not supported"
+        if "gemma-2" in self.model_path.lower():
+            logger.info(
+                f"When using sliding window in gemma-2, disable radix_cache, regex_jump_forward, and turn on flashinfer."
+            )
+            self.disable_radix_cache = True
+            self.disable_regex_jump_forward = True
+            self.disable_flashinfer = False
+            self.disable_cuda_graph = True
+            self.chunked_prefill_size = None
 
 
 @dataclasses.dataclass
diff --git a/python/sglang/test/long_prompt b/python/sglang/test/long_prompt
new file mode 100644
index 00000000000..301d7e107db
--- /dev/null
+++ b/python/sglang/test/long_prompt
@@ -0,0 +1 @@
+You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\nIntroduction\n\nThroughout U.S. history, Congress has created advisory commissions to assist in the development of public policy. Among other contexts, commissions have been used following crisis situations, including the September 11, 2001, terrorist attacks and the 2008 financial crisis. In such situations, advisory commissions may potentially provide Congress with a high-visibility forum to assemble expertise that might not exist within the legislative environment; allow for the in-depth examination of complex, cross-cutting policy issues; and lend bipartisan credibility to a set of findings and recommendations.\nAs Congress considers its range of responses to the coronavirus pandemic, the creation of one or more congressional advisory commissions is an option that could provide a platform for evaluating various pandemic-related policy issues over time. Past congressional advisory commissions have retrospectively evaluated policy responses, brought together diverse groups of experts, and supplemented existing congressional oversight mechanisms. Policymakers may determine that creating an advisory commission is unnecessary and instead prefer to utilize existing congressional oversight structures, such as standing or select committees, or already established oversight entities.\nThis report provides a comparative analysis of five proposed congressional advisory commissions that would investigate various aspects of the COVID-19 pandemic. The five proposed commissions are found in H.R. 6429 (the National Commission on COVID-19 Act, sponsored by Representative Stephanie Murphy), H.R. 6431 (the Made in America Emergency Preparedness Act, sponsored by Representative Brian Fitzpatrick), H.R. 6440 (the Pandemic Rapid Response Act, sponsored by Representative Rodney Davis), H.R. 6455 (the COVID-19 Commission Act, sponsored by Representative Bennie Thompson), and H.R. 6548 (the National Commission on the COVID-19 Pandemic in the United States Act, sponsored by Representative Adam Schiff). The overall structures of each of the proposed commissions are similar in many respects, both to each other and to previous independent advisory entities established by Congress. Specifically, the proposed commissions would (1) exist temporarily; (2) serve in an advisory capacity; and (3) report a work product detailing the commission\'s findings, conclusions, and recommendations. That said, each particular proposed commission has distinctive elements, particularly concerning its membership structure, appointment structure, and time line for reporting its work product to Congress.\nThis report compares the (1) membership structure, (2) appointment structure, (3) rules of procedure and operation, (4) duties and reporting requirements, (5) powers of the commission, (6) staffing issues, and (7) funding for each of the proposed COVID-19 commissions. Table 1 (at the end of this report) provides a side-by-side comparison of major provisions of the five proposals.\n\n	Membership Structure\n\nSeveral matters related to a commission\'s membership structure might be considered. They include the size of a commission, member qualifications, compensation of commission members, and requirements for partisan balance. \n\n		Size of Commission\n\nIn general, there is significant variation in the size of congressional advisory commissions. Among 155 identified congressional commissions created between the 101 st Congress and the 115 th Congress, the median size was 12 members, with the smallest commission having 5 members and the largest 33 members.\nThe membership structure of each of the five proposed commissions is similar to previous independent advisory entities created by Congress. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would each create a 10-member entity. H.R. 6455 would create a 25-member entity.\n\n		Qualifications\n\nPast legislation creating congressional commissions has often required or suggested that commission members possess certain substantive qualifications. Such provisions arguably make it more likely that the commission is populated with genuine experts in the policy area, which may improve the commission\'s final work product.\nH.R. 6455 would provide that commissioners \"shall be a United States person with significant expertise\" in a variety of fields related to public health and public administration. H.R. 6440 , H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide \"the sense of Congress\" that commission members should be \"prominent U.S. citizens\" who are nationally recognized experts in a variety of fields relevant to the pandemic and response efforts. In addition, H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 all prohibit the appointment of federal, state, and local government employees and officers. H.R. 6455 would prohibit federal employees from being commission members.\n\n		Compensation of Commission Members\n\nSome congressional commissions have compensated their members. For example, the National Commission on Terrorist Attacks Upon the United States (9/11 Commission) and the Financial Crisis Inquiry Commission provided that commission members could be compensated at a daily rate of basic pay. Nearly all have reimbursed members for travel expenses. Those that have provided for commissioner compensation most frequently provided compensation at the daily equivalent of level IV of the Executive Schedule.\nEach of the five proposals would provide that commission members be compensated at a rate \"not to exceed the daily equivalent of the annual rate of basic pay\" for level IV of the Executive Schedule, \"for each day during which that member is engaged in the actual performance of duties of the Commission.\" Members of three proposed commissions would receive travel expenses, including a per diem.\n\n		Partisan Limitations\n\nEach proposal provides a limit on the number of members appointed from the same political party. H.R. 6455 would provide that not more than 13 of its 25 members may be from the same party. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would provide that not more than 5 (of 10) members are from the same party. Most previous advisory entities created by Congress do not impose formal partisan restrictions on the membership structure. It may also be difficult to assess the political affiliation of potential members, who may have no formal affiliation (voter registration, for example) with a political party. Instead, most past advisory commissions usually achieve partisan balance through the appointment structure; for instance, by providing equal (or near-equal) numbers of appointments to congressional leaders of each party.\n\n	Appointment Structure\n\nPast congressional commissions have used a wide variety of appointment structures. Considerations regarding appointment structures include partisan balance, filling vacancies, and the time line for making commission appointments.\nThe statutory scheme may directly designate members of the commission, such as a specific cabinet official or a congressional leader. In other cases, selected congressional leaders, often with balance between the parties, appoint commission members. A third common statutory scheme is to have selected leaders, such as committee chairs and ranking members, recommend candidates for appointment to a commission. These selected leaders may act either in parallel or jointly, and the recommendation may be made either to other congressional leaders, such as the Speaker of the House and President pro tempore of the Senate, or to the President.\nEach of the five commission proposals would delegate most or all appointment authority to congressional leaders (including chamber, party, and committee leaders; see Table 1 for details). Additionally, H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 provide for one appointment to be made by the President. H.R. 6429 , H.R. 6431 , and H.R. 6548 would have the President appoint the commission\'s chair. H.R. 6455 has its membership appointed by the chairs and ranking members of designated House and Senate committees, and the Joint Economic Committee. H.R. 6455 does not provide any executive branch appointments.\nAttention to the proper balance between the number of members appointed by congressional leaders and by other individuals (such as the President), or to the number of Members of Congress required to be among the appointees, or to the qualifications of appointees, can be significant factors in enabling a commission to fulfill its congressional mandate.\nIn general, a commission\'s appointment scheme can impact both the commission\'s ability to fulfill its statutory duties and its final work product. For instance, if the scheme provides only for the appointment of Members of Congress to the commission, it arguably might not have the technical expertise or diversity of knowledge to complete its duties within the time given by statute. Similarly, if the appointment scheme includes qualifying provisos so specific that only a small set of private citizens could serve on the panel, the commission\'s final work product may arguably only represent a narrow range of viewpoints. None of the proposed COVID-19 commissions specify whether Members of Congress may serve on the commission.\n\n		Partisan Balance in Appointment Authority\n\nMost previous congressional advisory commissions have been structured to be bipartisan, with an even (or near-even) split of appointments between leaders of the two major parties. By achieving a nonpartisan or bipartisan character, congressional commissions may make their findings and recommendations more politically acceptable to diverse viewpoints. The bipartisan or nonpartisan arrangement can give recommendations strong credibility, both in Congress and among the public, even when dealing with divisive public policy issues. Similarly, commission recommendations that are perceived as partisan may have difficulty gaining support in Congress.\nIn some cases, however, bipartisanship also can arguably impede a commission\'s ability to complete its mandate. In situations where a commission is tasked with studying divisive or partisan issues, the appointment of an equal number of majority and minority commissioners may serve to promote partisanship within the commission rather than suppress it, raising the possibility of deadlock where neither side can muster a majority to act.\nEach of the five proposals employs a structure where leaders in both the majority and minority parties in Congress would make appointments. H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide for five majority and five minority appointments, including one for the President. H.R. 6440 would include two each by the Senate majority leader, the Senate minority leader, and the Speaker of the House, with one appointment by the House minority leader and one by the President, and the chair appointed by the Speaker and vice chair appointed by the Senate majority leader. H.R. 6455 would have 12 majority and 12 minority appointments made by the 12 committee chairs and ranking members and one member jointly appointed by the chair and vice chair of the Joint Economic Committee.\n\n		Vacancies\n\nAll five proposals provide that vacancies on the commission will not affect its powers and would be filled in the same manner as the original appointment.\n\n		Deadline for Appointments\n\nThree of the bills propose specific deadlines for the appointment of commissioners. H.R. 6429 and H.R. 6548 provide that appointments are made between specific dates in January or February 2021. Further, H.R. 6429 provides that commission members could be appointed in September 2020, if there is no longer a COVID-19 public health emergency in effectâas determined by the Secretary of Health and Human Servicesâas of August 31, 2020. H.R. 6440 would require all appointments be made by December 15, 2020. H.R. 6455 would require appointments to be made within 45 days after enactment. H.R. 6429 , H.R. 6440 , and H.R. 6548 would start the commission\'s work in early 2021, as the commission cannot operate without the appointment of members. H.R. 6429 , however would provide that the proposed commission\'s work would begin no later than October 31, 2020, if members are appointed in September 2020. H.R. 6431 does not specify a deadline for the appointment of members.\nTypically, deadlines for appointment can range from several weeks to several months. For example, the deadline for appointments to the Antitrust Modernization Commission was 60 days after the enactment of its establishing act. The deadline for appointment to the Commission on Wartime Contracting in Iraq and Afghanistan was 120 days from the date of enactment. The deadline for appointment to the 9/11 Commission was December 15, 2002, 18 days after enactment of the act.\n\n	Rules of Procedure and Operations\n\nWhile most statutes that authorize congressional advisory commissions do not provide detailed procedures for how the commission should conduct its business, the statutory language may provide a general structure, including a mechanism for selecting a chair and procedures for creating rules. None of the five COVID-19 commission proposals contain language that directs the process for potentially adopting rules of procedure. For a comparison of each proposed commission\'s specified rules of procedures and operations, see Table 1 .\n\n		Chair Selection\n\nEach bill provides for the selection of a chair and/or vice chair of the commission. H.R. 6429 , H.R. 6431 , and H.R. 6548 would have the chair appointed by the President and the vice chair appointed by congressional leaders of the political party opposite the President. H.R. 6440 would have the chair appointed by the Speaker of the House (in consultation with the Senate majority leader and the House minority leader) and the vice chair appointed by the Senate majority leader (in consultation with the Speaker of the House and the Senate minority leader). H.R. 6455 would have the chair and vice chair chosen from among commission members by a majority vote of the commission, and would require the chair and vice chair to have \"significant experience\" in areas to be studied by the commission.\n\n		Initial Meeting Deadline\n\nAs with the timing of commission appointments, some authorizing statutes are prescriptive in when the commission\'s first meeting should take place. Three of the bills analyzed here provide specific time lines for the commission\'s first meeting. H.R. 6429 would require the first meeting to be no later than March 15, 2021, unless members are appointed in September 2020 (if no public health emergency exists). H.R. 6455 would require the first meeting within 45 days after the appointment of all commission members, which isâgiven the 45-day deadline for appointmentâeffectively a maximum of 90 days after enactment. H.R. 6548 would direct the commission to hold its initial meeting \"as soon as practicable,\" but not later than March 5, 2021. H.R. 6431 and H.R. 6440 do not provide for an initial meeting deadline. Instead, they direct the commission to meet \"as soon as practicable.\" \n\n		Quorum\n\nMost commission statutes provide that a quorum will consist of a particular number of commissioners, usually a majority, but occasionally a supermajority. All five bills would provide for a quorum requirement. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would define a quorum as 6 (of 10) members. H.R. 6455 would provide that a quorum is 18 of 25 members (72%).\n\n		Public Access\n\nAll five commission bills would require commission meetings to be open to the public. Each bill would also require that reports be made publicly available.\n\n		Formulating Other Rules of Procedure and Operations\n\nAbsent statutory guidance (eithe r in general statutes or in individual statutes authorizing commissions), advisory entities vary widely in how they adopt their rules of procedure. In general, three models exist: formal written rules, informal rules, and the reliance on norms. Any individual advisory entity might make use of all three of these models for different types of decisionmaking. \nThe choice to adopt written rules or rely on informal norms to guide commission procedure may be based on a variety of factors, such as the entity\'s size, the frequency of meetings, member preferences regarding formality, the level of collegiality among members, and the amount of procedural guidance provided by the entity\'s authorizing statute. Regardless of how procedural issues are handled, protocol for decisionmaking regarding the following operational issues may be important for the commission to consider at the outset of its existence: eligibility to vote and proxy rules; staff hiring, compensation, and work assignments; hearings, meetings, and field visits; nonstaff expenditures and contracting; reports to Congress; budgeting; and procedures for future modification of rules. None of the five COVID-19 commission proposals specify that the proposed commission must adopt written rules.\n\n		FACA Applicability\n\nThe Federal Advisory Committee Act (FACA) mandates certain structural and operational requirements, including formal reporting and oversight procedures, for certain federal advisory bodies that advise the executive branch. Three proposals ( H.R. 6429 , H.R. 6431 , and H.R. 6548 ) specifically exempt the proposed commission from FACA. Of the remaining two, FACA would also likely not apply to the commission proposed in H.R. 6455 because it would be appointed entirely by Members of Congress, although it only specifies that its final report is public, not whether it is specifically sent to Congress and/or the President. It is not clear that FACA would apply to the commission proposed in H.R. 6440 . Although it includes a presidential appointment and its report would be sent to both Congress and the President, its establishment clause specifies that the commission \"is established in the legislative branch,\" and a super-majority of its members would be appointed by Congress.\n\n	Duties and Reporting Requirements\n\nMost congressional commissions are generally considered policy commissionsâtemporary bodies that study particular policy problems and report their findings to Congress or review a specific event. \n\n		General Duties\n\nAll five of the proposed commissions would be tasked with duties that are analogous to those of past policy commissions. While the specific mandates differ somewhat, all proposed commissions are tasked with investigating aspects of the COVID-19 pandemic and submitting one or more reports that include the commission\'s findings, conclusions, and recommendations for legislative action. H.R. 6440 would specifically require the commission to avoid unnecessary duplication of work being conducted by the Government Accountability Office (GAO), congressional committees, and executive branch agency and independent commission investigations.\n\n		Reports\n\nEach proposed commission would be tasked with issuing a final report detailing its findings, conclusions, and recommendations. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would provide that the commission \"may submit\" interim reports to Congress and the President, but do not provide time lines on when those reports might be submitted. In each case, the interim report would need to be agreed to by a majority of commission members. H.R. 6431 would also require the commission to submit a report on actions taken by the states and a report on essential products, materials, ingredients, and equipment required to fight pandemics.\nH.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 also specify that final reports shall be agreed to by a majority of commission members. H.R. 6455 does not specify a vote threshold for approval of its report.\nNone of the bills make specific provisions for the inclusion of minority viewpoints. Presumably this would leave each commission with discretion on whether to include or exclude minority viewpoints. Past advisory entities have been proposed or established with a variety of statutory reporting conditions, including the specification of majority or super-majority rules for report adoption and provisions requiring the inclusion of minority viewpoints. In practice, advisory bodies that are not given statutory direction on these matters have tended to work under simple-majority rules for report adoption.\n\n		Report Deadlines\n\nH.R. 6429 would require a final report one year after the commission\'s initial meeting. H.R. 6431 and H.R. 6440 would require a final report not later than 18 months after enactment. H.R. 6455 would require a final report to be published not later than 18 months after the commission\'s first meeting. \nH.R. 6548 would require a final report by October 15, 2021. This deadline could be extended by 90 days upon a vote of no fewer than 8 (out of 10) commission members. The commission could vote to extend its final report deadline up to three times, and would be required to notify Congress, the President, and the public of any such extension.\nWhile such a deadline would potentially give the commission a defined period of time to complete its work, setting a particular date for report completion could potentially create unintended time constraints. Any delay in the passage of the legislation or in the appointment process would reduce the amount of time the commission has to complete its work, even with the opportunity for the commission to extend its own deadline up to three times.\nThe length of time a congressional commission has to complete its work is arguably one of the most consequential decisions when designing an advisory entity. If the entity has a short window of time, the quality of its work product may suffer or it may not be able to fulfill its statutory mandate on time.\nOn the other hand, if the commission is given a long period of time to complete its work, it may undermine one of a commission\'s primary legislative advantages, the timely production of expert advice on a current matter. A short deadline may also affect the process of standing up a new commission. The selection of commissioners, recruitment of staff, arrangement of office space, and other logistical matters may require expedited action if short deadlines need to be met.\n\n		Report Submission\n\nOf the five proposed commissions, four ( H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 ) are directed to submit their reports to both Congress and the President. H.R. 6455 requires that the report is made public.\nMost congressional advisory commissions are required to submit their reports to Congress, and sometimes to the President or an executive department or agency head. For example, the National Commission on Severely Distressed Public Housing\'s final report was submitted to both Congress and the Secretary of Housing and Urban Development.\n\n		Commission Termination\n\nCongressional commissions are usually statutorily mandated to terminate. Termination dates for most commissions are linked to either a fixed period of time after the establishment of the commission, the selection of members, or the date of submission of the commission\'s final report. Alternatively, some commissions are given fixed calendar termination dates.\nAll five commission proposals would provide for the commission to terminate within a certain period of time following submission of its final report. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6455 would each direct the commission to terminate 60 days after the submission; H.R. 6548 specifies a time line of 90 days after submission.\n\n	Commission Powers\n\nEach of the five proposals would provide the proposed commission with certain powers to carry out its mission (see Table 1 for specifics). One general issue for commissions is who is authorized to execute such powers. In some cases, the commission itself executes its powers, with the commission deciding whether to devise rules and procedures for the general use of such power. In other cases, the legislation specifically authorizes the commission to give discretionary power to subcommittees or individual commission members. Finally, the legislation itself might grant certain powers to individual members of the commission, such as the chair.\n\n		Hearings and Evidence\n\nAll five bills would provide the proposed commission with the power to hold hearings, take testimony, and receive evidence. All five commissions would also be provided the power to administer oaths to witnesses.\n\n		Subpoenas\n\nFour of the bills would provide the commission with subpoena power. H.R. 6440 would not provide subpoena power to the commission. H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide that subpoenas could only be issued by either (1) agreement of the chair and vice chair, or (2) the affirmative vote of 6 (of 10) commission members. H.R. 6455 would require that a subpoena could only be issued by either agreement of the chair and vice chair or an affirmative vote of 18 (of 25) commission members. All four bills that would provide subpoena power contain substantially similar judicial methods of subpoena enforcement.\n\n		Administrative Support\n\nAll five of the bills would provide that the commission receive administrative support from the General Services Administration (GSA). The GSA provides administrative support to dozens of federal entities, including congressional advisory commissions. Each of the five bills would provide that GSA be reimbursed for its services by the commission. Each bill also provides that other departments or agencies may provide funds, facilities, staff, and other services to the commission.\n\n		Other Powers\n\nWithout explicit language authorizing certain activities, commissions often cannot gather information, enter into contracts, use the U.S. mail like an executive branch entity, or accept donations or gifts. \nAll five bills direct that federal agencies provide information to the commission upon request. H.R. 6429 , H.R. 6431 , and H.R. 6548 would also provide that the commission could use the U.S. mails in the same manner as any department or agency, enter into contracts, and accept gifts or donations of services or property.\n\n	Staffing\n\nThe proposed COVID-19 commissions contain staffing provisions commonly found in congressional advisory commission legislation. Congressional advisory commissions are usually authorized to hire staff. Most statutes specify that the commission may hire a lead staffer, often referred to as a \"staff director,\" \"executive director,\" or another similar title, in addition to additional staff as needed. Rather than mandate a specific staff size, many commissions are instead authorized to appoint a staff director and other personnel as necessary, subject to the limitations of available funds.\nMost congressional commissions are also authorized to hire consultants, procure intermittent services, and request that federal agencies detail personnel to aid the work of the commission.\n\n		Director and Commission Staff\n\nFour of the bills provide that the commission may hire staff without regard to certain laws regarding the competitive service; H.R. 6440 does not specifically exempt the commission from such laws. Four bills ( H.R. 6429 , H.R. 6431 , H.R. 6455 , and H.R. 6548 ) would authorize, but not require, the commission to hire a staff director and additional staff, as appropriate. Four proposals would limit staff salaries to level V of the executive schedule. Three of the bills would specifically designate staff as federal employees for the purposes of certain laws, such as workman\'s compensation, retirement, and other benefits.\n\n		Detailees\n\nWhen authorized, some commissions can have federal agency staff detailed to the commission. All five bills would provide that federal employees could be detailed to the commission. Four bills would provide that the detailee would be without reimbursement to his or her home agency. H.R. 6440 would allow detailees on a reimbursable basis. \n\n		Experts and Consultants\n\nAll five bills would provide the commission with the authority to hire experts and consultants. Four of the bills limit the rate of pay for consultants to level IV of the Executive Schedule. H.R. 6440 does not specify a specific limit.\n\n		Security Clearances\n\nFour bills would provide that federal agencies and departments shall cooperate with the commission to provide members and staff appropriate security clearances. H.R. 6440 does not contain a security clearance provision.\n\n	Funding and Costs\n\nCommissions generally require funding to help meet their statutory goals. When designing a commission, therefore, policymakers may consider both how the commission will be funded, and how much funding the commission will be authorized to receive. Four of the five proposals specify a funding mechanism for the commission.\nHow commissions are funded and the amounts that they receive vary considerably. Several factors can contribute to overall commission costs. These factors might include the cost of hiring staff, contracting with outside consultants, and engaging administrative support, among others. Additionally, most commissions reimburse the travel expenditures of commissioners and staff, and some compensate their members. The duration of a commission can also significantly affect its cost; past congressional commissions have been designed to last anywhere from several months to several years.\n\n		Costs\n\nIt is difficult to estimate or predict the potential overall cost of any commission. Annual budgets for congressional advisory entities range from several hundred thousand dollars to millions of dollars annually. Overall expenses for any individual advisory entity depend on a variety of factors, the most important of which are the number of paid staff and the commission\'s duration and scope. Some commissions have few full-time staff; others employ large numbers, such as the National Commission on Terrorist Attacks Upon the United States, which had a full-time paid staff of nearly 80. Secondary factors that can affect commission costs include the number of commissioners, how often the commission meets or holds hearings, whether or not the commission travels or holds field hearings, and the publications the commission produces.\n\n		Authorized Funding\n\nThree of the bills ( H.R. 6429 , H.R. 6440 , and H.R. 6548 ) would authorize the appropriation of \"such sums as may be necessary\" for the commission, to be derived in equal amounts from the contingent fund of the Senate and the applicable accounts of the House of Representatives. H.R. 6429 and H.R. 6548 would provide that funds are available until the commission terminates. H.R. 6455 would authorize the appropriation of $4 million for the commission, to remain available until the commission terminates. H.R. 6431 does not include an authorization of appropriations.\n\n	Comparison of Proposals to Create a COVID-19 Commission\n\n Table 1 provides a side-by-side comparison of major provisions of the five proposals. For each bill, the membership structure, appointment structure, rules of procedure and operation, duties and reporting requirements, proposed commission powers, staffing provisions, and funding are compared.\n\nSummary:\n
\ No newline at end of file
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
index fadd56e8c23..c8357a16c6c 100644
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -15,6 +15,7 @@
 
 import json
 import multiprocessing
+import os
 from dataclasses import dataclass
 from typing import List, Union
 
@@ -31,8 +32,14 @@
     "The capital of the United Kindom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
 ]
 
+dirpath = os.path.dirname(__file__)
+with open(os.path.join(dirpath, "long_prompt"), "r") as f:
+    long_prompt = f.read()
+DEFAULT_PROMPTS.append(long_prompt)
+
 NUM_TOP_LOGPROBS = 5
 
 
@@ -125,16 +132,14 @@ def start_model_process(
                         )
 
                         logits = self.model.forward(input_ids).logits[0]
-                        logprobs = F.log_softmax(
-                            logits, dim=-1, dtype=torch.float32
-                        ).tolist()
-                        # index_of_max = (lambda nums: nums.index(max(nums)))(logprobs[-1])
-                        # print("index", index_of_max)
-                        logprobs = [
-                            sorted(token_logprobs, reverse=True)[:NUM_TOP_LOGPROBS]
-                            for token_logprobs in logprobs
-                        ]
-                        prefill_logprobs.append(logprobs)
+                        logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+                        logprobs, top_indices = torch.topk(
+                            logprobs, k=NUM_TOP_LOGPROBS, dim=-1
+                        )
+                        # print("index", top_indices)
+                        prefill_logprobs.append(logprobs.tolist())
+                        del logits
+                        del logprobs
 
                     out_queue.put(
                         ModelOutput(
@@ -186,6 +191,7 @@ def __init__(
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
+            mem_fraction_static=0.7,
         )
 
     def forward(
diff --git a/scripts/playground/reference_hf.py b/scripts/playground/reference_hf.py
index ac91b3bed40..d2d31161017 100644
--- a/scripts/playground/reference_hf.py
+++ b/scripts/playground/reference_hf.py
@@ -35,18 +35,17 @@ def normal_text(args):
         args.model_path,
         torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
+        device_map="auto",
         trust_remote_code=True,
     )
     m.cuda()
 
-    print(m)
-
     prompts = [
         "The capital of France is",
         "The capital of the United Kindom is",
         "Today is a sunny day and I like",
     ]
-    max_new_tokens = 32
+    max_new_tokens = 16
 
     for p in prompts:
         if isinstance(p, str):
@@ -58,10 +57,11 @@ def normal_text(args):
             input_ids, do_sample=False, max_new_tokens=max_new_tokens
         )
         output_str = t.decode(output_ids[0])
-        print(output_str)
 
         prefill_logits = m.forward(input_ids).logits[0][-1]
+
         print("prefill logits", prefill_logits)
+        print(output_str)
 
 
 @torch.inference_mode()
diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py
index 520e811a807..67e47d90d3b 100644
--- a/test/srt/models/test_embedding_models.py
+++ b/test/srt/models/test_embedding_models.py
@@ -53,11 +53,13 @@ def assert_close_prefill_logits(
             srt_logits = torch.Tensor(srt_outputs.embed_logits[i])
 
             similarities = torch.tensor(get_similarities(hf_logits, srt_logits))
+            print("max similarity diff", torch.max(abs(similarities - 1)))
 
-            tolerance = 1e-2
-            assert torch.all(
-                abs(similarities - 1) < tolerance
-            ), f"embeddings not all close"
+            if hf_logits.shape[0] <= 100:
+                tolerance = 1e-2
+                assert torch.all(
+                    abs(similarities - 1) < tolerance
+                ), f"embeddings not all close"
 
     def test_prefill_logits(self):
         for model, tp_size in MODELS:
diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py
index ca4f096e301..bb56ebdad79 100644
--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -20,8 +20,8 @@
 from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
 
 MODELS = [
-    ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1),
-    ("google/gemma-2-2b", 1),
+    ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1, 1.1),
+    ("google/gemma-2-2b", 1, 3),
 ]
 TORCH_DTYPES = [torch.float16]
 
@@ -35,6 +35,7 @@ def assert_close_prefill_logits_and_output_strs(
         tp_size,
         torch_dtype,
         max_new_tokens,
+        long_context_tolerance,
     ) -> None:
         with HFRunner(
             model_path, torch_dtype=torch_dtype, is_generation_model=True
@@ -53,15 +54,19 @@ def assert_close_prefill_logits_and_output_strs(
             hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
             srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
 
-            tolerance = 3e-2
-            assert torch.all(
-                abs(hf_logprobs - srt_logprobs) < tolerance
-            ), f"prefill logprobs not all close"
+            print("max_diff", torch.max(abs(hf_logprobs - srt_logprobs)))
+            if hf_logprobs.shape[0] <= 100:
+                tolerance = 3e-2
+                assert torch.all(
+                    abs(hf_logprobs - srt_logprobs) < tolerance
+                ), f"prefill logprobs not all close"
 
+        print(hf_outputs.output_strs)
+        print(srt_outputs.output_strs)
         assert hf_outputs.output_strs == srt_outputs.output_strs
 
-    def test_prefill_logits(self):
-        for model, tp_size in MODELS:
+    def test_prefill_logits_and_output_strs(self):
+        for model, tp_size, long_context_tolerance in MODELS:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 8
                 self.assert_close_prefill_logits_and_output_strs(
@@ -70,6 +75,7 @@ def test_prefill_logits(self):
                     tp_size,
                     torch_dtype,
                     max_new_tokens,
+                    long_context_tolerance=long_context_tolerance,
                 )