diff --git a/python/sglang/srt/models/baichuan.py b/python/sglang/srt/models/baichuan.py
index 2c04e5aeb03..641937a4af0 100644
--- a/python/sglang/srt/models/baichuan.py
+++ b/python/sglang/srt/models/baichuan.py
@@ -24,7 +24,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -330,7 +329,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         position_embedding: str,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -404,7 +403,7 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         if config.hidden_size == 4096:  # baichuan2 7b
diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py
index 3d1319e400b..e9110d06713 100644
--- a/python/sglang/srt/models/chatglm.py
+++ b/python/sglang/srt/models/chatglm.py
@@ -22,7 +22,6 @@
 import torch
 from torch import nn
 from torch.nn import LayerNorm
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -52,7 +51,7 @@ def __init__(
         self,
         config,
         layer_id: int = 0,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -188,7 +187,7 @@ def __init__(
         self,
         config,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -260,7 +259,7 @@ class GLMTransformer(nn.Module):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -308,7 +307,7 @@ class ChatGLMModel(nn.Module):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -359,7 +358,7 @@ class ChatGLMForCausalLM(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoraConfig] = None,
     ):
diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py
index f2ad529633e..05d0010a563 100644
--- a/python/sglang/srt/models/commandr.py
+++ b/python/sglang/srt/models/commandr.py
@@ -45,7 +45,6 @@
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -320,7 +319,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py
index 9d4fafd6369..76cf5494631 100644
--- a/python/sglang/srt/models/dbrx.py
+++ b/python/sglang/srt/models/dbrx.py
@@ -20,7 +20,6 @@
 import torch
 import torch.nn as nn
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -368,7 +367,7 @@ def __init__(
         self,
         config: DbrxConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ):
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py
index b320b5167ad..82565ed0ed6 100644
--- a/python/sglang/srt/models/deepseek.py
+++ b/python/sglang/srt/models/deepseek.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -185,7 +184,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -262,7 +261,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -331,7 +330,7 @@ class DeepseekModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -374,7 +373,7 @@ class DeepseekForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index ce632275a59..8889eed649b 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -188,7 +187,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
     ) -> None:
@@ -336,7 +335,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
     ) -> None:
@@ -498,7 +497,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -594,7 +593,7 @@ class DeepseekV2Model(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -640,7 +639,7 @@ class DeepseekV2ForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/exaone.py b/python/sglang/srt/models/exaone.py
index f0b47786a70..2efb9404110 100644
--- a/python/sglang/srt/models/exaone.py
+++ b/python/sglang/srt/models/exaone.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -295,7 +294,7 @@ def __init__(
         self,
         config,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py
index 5c345616124..238bcb309c6 100644
--- a/python/sglang/srt/models/gemma.py
+++ b/python/sglang/srt/models/gemma.py
@@ -21,7 +21,7 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -279,7 +279,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         del lora_config  # Unused.
         super().__init__()
diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py
index 47fbd6334c1..8f7d0bee472 100644
--- a/python/sglang/srt/models/gemma2.py
+++ b/python/sglang/srt/models/gemma2.py
@@ -20,7 +20,7 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 # from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
@@ -105,7 +105,7 @@ def __init__(
         head_dim: int,
         max_position_embeddings: int,
         rope_theta: float,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -190,7 +190,7 @@ def __init__(
         self,
         layer_idx: int,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -257,7 +257,7 @@ class Gemma2Model(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -336,7 +336,7 @@ class Gemma2ForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ) -> None:
diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py
index ad61b742fa7..0f38ba2c8d9 100644
--- a/python/sglang/srt/models/gpt_bigcode.py
+++ b/python/sglang/srt/models/gpt_bigcode.py
@@ -21,7 +21,7 @@
 import torch
 from torch import nn
 from transformers import GPTBigCodeConfig
-from vllm.config import CacheConfig, LoRAConfig
+from vllm.config import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -44,7 +44,7 @@ def __init__(
         self,
         layer_id: int,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -145,7 +145,7 @@ def __init__(
         self,
         layer_id: int,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
@@ -183,7 +183,7 @@ class GPTBigCodeModel(nn.Module):
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ):
@@ -243,7 +243,7 @@ class GPTBigCodeForCausalLM(nn.Module):
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         lora_config: Optional[LoRAConfig] = None,
     ):
diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py
index 80bdc2c4c5d..e7a0e06c57a 100644
--- a/python/sglang/srt/models/grok.py
+++ b/python/sglang/srt/models/grok.py
@@ -23,7 +23,6 @@
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -289,7 +288,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py
index 087793afcfa..1dd369e5e97 100644
--- a/python/sglang/srt/models/internlm2.py
+++ b/python/sglang/srt/models/internlm2.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -254,7 +253,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
index 930c6838d85..543703c230b 100644
--- a/python/sglang/srt/models/llama.py
+++ b/python/sglang/srt/models/llama.py
@@ -22,7 +22,6 @@
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -295,7 +294,7 @@ def __init__(
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py
index 58ec09ba035..bd101b9b1f8 100644
--- a/python/sglang/srt/models/llama_classification.py
+++ b/python/sglang/srt/models/llama_classification.py
@@ -18,7 +18,6 @@
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.config import CacheConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -32,7 +31,7 @@ def __init__(
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/llama_reward.py b/python/sglang/srt/models/llama_reward.py
index fd868a62c16..2e9c0457f0a 100644
--- a/python/sglang/srt/models/llama_reward.py
+++ b/python/sglang/srt/models/llama_reward.py
@@ -18,7 +18,6 @@
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.config import CacheConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -33,7 +32,7 @@ def __init__(
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -92,7 +91,7 @@ def __init__(
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__(config, quant_config, cache_config)
         self.weights = self.Weights(config.hidden_size, self.num_labels)
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
index b9c1fa0aabd..0ee11489299 100644
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -31,7 +31,6 @@
     SiglipVisionModel,
 )
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
-from vllm.config import CacheConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -450,7 +449,7 @@ def __init__(
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
@@ -472,7 +471,7 @@ def __init__(
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
@@ -505,7 +504,7 @@ def __init__(
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/llavavid.py b/python/sglang/srt/models/llavavid.py
index 82aa7c15d77..d874a472ef9 100644
--- a/python/sglang/srt/models/llavavid.py
+++ b/python/sglang/srt/models/llavavid.py
@@ -22,7 +22,6 @@
 from torch import nn
 from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
-from vllm.config import CacheConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -36,7 +35,7 @@ def __init__(
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py
index 777d572f9c3..6436eb626e6 100644
--- a/python/sglang/srt/models/minicpm.py
+++ b/python/sglang/srt/models/minicpm.py
@@ -20,7 +20,6 @@
 import torch
 from torch import nn
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -278,7 +277,7 @@ def __init__(
         self,
         config,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/minicpm3.py b/python/sglang/srt/models/minicpm3.py
index abdf107d3cb..9c8850787f4 100644
--- a/python/sglang/srt/models/minicpm3.py
+++ b/python/sglang/srt/models/minicpm3.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -108,7 +107,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
     ) -> None:
@@ -252,7 +251,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
     ) -> None:
@@ -409,7 +408,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -501,7 +500,7 @@ class MiniCPM3Model(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -552,7 +551,7 @@ class MiniCPM3ForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py
index b72220fe5b9..6ad8023675e 100644
--- a/python/sglang/srt/models/mixtral.py
+++ b/python/sglang/srt/models/mixtral.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
 from transformers import MixtralConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -293,7 +292,7 @@ def __init__(
         self,
         config: MixtralConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py
index f69d2ea59cb..7ceb990a819 100644
--- a/python/sglang/srt/models/mixtral_quant.py
+++ b/python/sglang/srt/models/mixtral_quant.py
@@ -23,7 +23,6 @@
 import torch.nn.functional as F
 from torch import nn
 from transformers import MixtralConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -325,7 +324,7 @@ def __init__(
         self,
         config: MixtralConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py
index 3e851268d5f..92352809f17 100644
--- a/python/sglang/srt/models/olmoe.py
+++ b/python/sglang/srt/models/olmoe.py
@@ -23,7 +23,6 @@
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -298,7 +297,7 @@ class OlmoeForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py
index 614311c17d6..c69219d86a4 100644
--- a/python/sglang/srt/models/qwen.py
+++ b/python/sglang/srt/models/qwen.py
@@ -20,7 +20,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -243,7 +242,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ):
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py
index d4beae4b375..bab09d37ce6 100644
--- a/python/sglang/srt/models/qwen2.py
+++ b/python/sglang/srt/models/qwen2.py
@@ -20,7 +20,6 @@
 import torch
 from torch import nn
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -268,7 +267,7 @@ def __init__(
         self,
         config: Qwen2Config,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
index 2eed8ff4576..860c00a7ec2 100644
--- a/python/sglang/srt/models/qwen2_moe.py
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -23,7 +23,6 @@
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -160,7 +159,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -236,7 +235,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -306,7 +305,7 @@ class Qwen2MoeModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -355,7 +354,7 @@ class Qwen2MoeForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py
index b211037fced..6e6d5ea0e98 100644
--- a/python/sglang/srt/models/stablelm.py
+++ b/python/sglang/srt/models/stablelm.py
@@ -22,7 +22,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -241,7 +240,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/torch_native_llama.py b/python/sglang/srt/models/torch_native_llama.py
index f40424ab0a2..29c92955fca 100644
--- a/python/sglang/srt/models/torch_native_llama.py
+++ b/python/sglang/srt/models/torch_native_llama.py
@@ -24,7 +24,6 @@
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import LlamaConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -380,7 +379,7 @@ def __init__(
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__()
         self.config = config
diff --git a/python/sglang/srt/models/xverse.py b/python/sglang/srt/models/xverse.py
index bd10606b598..42d873785e1 100644
--- a/python/sglang/srt/models/xverse.py
+++ b/python/sglang/srt/models/xverse.py
@@ -22,7 +22,6 @@
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -297,7 +296,7 @@ def __init__(
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         efficient_weight_load=False,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/xverse_moe.py b/python/sglang/srt/models/xverse_moe.py
index 7ff25b34017..a5c5f4ccb78 100644
--- a/python/sglang/srt/models/xverse_moe.py
+++ b/python/sglang/srt/models/xverse_moe.py
@@ -19,7 +19,6 @@
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -183,7 +182,7 @@ def __init__(
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -260,7 +259,7 @@ def __init__(
         self,
         config: PretrainedConfig,
         layer_id: int,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -328,7 +327,7 @@ class XverseModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -371,7 +370,7 @@ class XverseMoeForCausalLM(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py
index 52f930edcd7..e10fe5cf15f 100644
--- a/python/sglang/srt/models/yivl.py
+++ b/python/sglang/srt/models/yivl.py
@@ -20,7 +20,6 @@
 import torch
 import torch.nn as nn
 from transformers import CLIPVisionModel, LlavaConfig
-from vllm.config import CacheConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -32,7 +31,7 @@ def __init__(
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config=None,
     ) -> None:
         super().__init__(config, quant_config, cache_config)