Manifest updates (#81)
* squashed: modify filter for LoRA affinity

* update llm service and llm server pool yaml, readme

* remove unused method from metrics.go

* add flowchart image

* update flowchart image size

* remove image name

* update queueingThresholdLoRA to 50

* rollback filter related changes

* rollback filter related changes in docs

* addressing comments

* addressing comments
kaushikmitr authored Dec 10, 2024
1 parent ca47aa2 commit 83f701b
Showing 6 changed files with 158 additions and 45 deletions.
95 changes: 86 additions & 9 deletions examples/poc/manifests/llmservice.yaml
@@ -1,4 +1,14 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: LLMServerPool
+metadata:
+  labels:
+  name: vllm-llama2-7b-pool
+spec:
+  targetPort: 8000
+  modelServerSelector:
+    "app": "vllm-llama2-7b-pool"
+---
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: LLMService
 metadata:
   labels:
@@ -7,17 +17,84 @@ metadata:
   name: llmservice-sample
 spec:
   models:
-  - name: sql-code-assist
-  - name: npc-bot
+  - name: sql-lora
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora
+      weight: 100
+  - name: sql-lora-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-0
+      weight: 100
+  - name: sql-lora-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-1
+      weight: 100
+  - name: sql-lora-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-2
+      weight: 100
+  - name: sql-lora-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-3
+      weight: 100
+  - name: sql-lora-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-4
+      weight: 100
+  - name: tweet-summary
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary
+      weight: 100
+  - name: tweet-summary-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-0
+      weight: 100
+  - name: tweet-summary-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-1
+      weight: 100
+  - name: tweet-summary-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-2
+      weight: 100
+  - name: tweet-summary-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-3
+      weight: 100
+  - name: tweet-summary-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-4
+      weight: 100
+  - name: meta-llama/Llama-2-7b-hf
     objective:
       desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
     targetModels:
-    - name: npc-bot-v1
-      weight: 50
-    - name: npc-bot-v2
-      weight: 50
+    - name: meta-llama/Llama-2-7b-hf
+      weight: 100
   poolRef:
   - kind: LLMServerPool
-    name: test-pool
-  - name: gemini-pool
-    kind: LLMServerPool
+    name: vllm-llama2-7b-pool
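
Every model entry above routes 100% of its traffic to a single target, but the targetModels schema also supports weighted splits, as the removed npc-bot entry's 50/50 weighting shows. A minimal sketch of a weighted canary rollout under this schema; the 90/10 weights are hypothetical and not part of this commit:

# Hypothetical weighted split between two adapters registered above.
- name: tweet-summary
  objective:
    desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
  targetModels:
  - name: tweet-summary-0
    weight: 90   # stable adapter takes most traffic
  - name: tweet-summary-1
    weight: 10   # canary adapter takes the remainder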
23 changes: 19 additions & 4 deletions examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -1,17 +1,32 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama2-7b-pool
+spec:
+  selector:
+    app: vllm-llama2-7b-pool
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
+
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm
+  name: vllm-llama2-7b-pool
   namespace: default
 spec:
-  replicas: 6
+  replicas: 3
   selector:
     matchLabels:
-      app: vllm
+      app: vllm-llama2-7b-pool
   template:
     metadata:
       labels:
-        app: vllm
+        app: vllm-llama2-7b-pool
     spec:
       containers:
       - name: lora
12 changes: 0 additions & 12 deletions examples/poc/manifests/vllm/vllm-lora-service.yaml

This file was deleted.
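
The standalone Service manifest is deleted because the commit folds the Service into vllm-lora-deployment.yaml above. For routing to work, names and labels must line up across the manifests in this commit; a condensed sketch of that contract, with values taken from the diffs and comments that are editorial:

# The LLMServerPool's modelServerSelector, the Service's selector, and the
# Deployment's pod labels all use app=vllm-llama2-7b-pool; the ext proc's
# -serverPoolName and -serviceName flags reference the same name.
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMServerPool
metadata:
  name: vllm-llama2-7b-pool
spec:
  targetPort: 8000                 # must match the Service/container port
  modelServerSelector:
    "app": "vllm-llama2-7b-pool"   # must match the Deployment's pod labels
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool        # referenced by the ext proc -serviceName flag
spec:
  selector:
    app: vllm-llama2-7b-pool       # same label again
  ports:
  - port: 8000
    targetPort: 8000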

61 changes: 47 additions & 14 deletions pkg/manifests/ext_proc.yaml
@@ -1,44 +1,77 @@
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["llmservices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["llmserverpools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read-binding
+subjects:
+- kind: ServiceAccount
+  name: default
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: pod-read
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: instance-gateway-ext-proc
+  name: inference-gateway-ext-proc
   namespace: default
   labels:
-    app: instance-gateway-ext-proc
+    app: inference-gateway-ext-proc
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: instance-gateway-ext-proc
+      app: inference-gateway-ext-proc
   template:
     metadata:
       labels:
-        app: instance-gateway-ext-proc
+        app: inference-gateway-ext-proc
     spec:
       containers:
-      - name: instance-gateway-ext-proc
+      - name: inference-gateway-ext-proc
         # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args.
         image: <BUILT-IMAGE>
         args:
         # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/12) Remove this once ext proc can dynamically reconcile on LLMServerPool.
-        - -pods
-        - "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84"
-        - -podIPs
-        - "10.24.11.6:8000,10.24.5.7:8000"
+        - -serverPoolName
+        - "vllm-llama2-7b-pool"
+        - -v
+        - "3"
+        - -serviceName
+        - "vllm-llama2-7b-pool"
         ports:
         - containerPort: 9002
+      - name: curl
+        image: curlimages/curl
+        command: ["sleep", "3600"]
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: instance-gateway-ext-proc
+  name: inference-gateway-ext-proc
   namespace: default
 spec:
   selector:
-    app: instance-gateway-ext-proc
+    app: inference-gateway-ext-proc
   ports:
   - protocol: TCP
     port: 9002
@@ -55,12 +88,12 @@ spec:
   - backendRefs:
     - group: ""
       kind: Service
-      name: instance-gateway-ext-proc
+      name: inference-gateway-ext-proc
       port: 9002
     processingMode:
       request:
         body: Buffered
-      response:
+      response: {}
      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
       messageTimeout: 1000s
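
The processingMode block is what lets the ext proc pick a backend per request: buffering the request body gives it the "model" field before a backend is chosen, and the response stanza turns on response processing. Reading response: {} as "enabled with default, headers-only mode" is an assumption about Envoy Gateway semantics, not something this commit states:

processingMode:
  request:
    body: Buffered   # full request body is sent to the ext proc, so it can read "model"
  response: {}       # assumed: response processing enabled with Envoy Gateway defaults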
8 changes: 4 additions & 4 deletions pkg/manifests/gateway.yaml
@@ -3,9 +3,9 @@
 apiVersion: gateway.networking.k8s.io/v1
 kind: Gateway
 metadata:
-  name: <GATEWAY-NAME>
+  name: inference-gateway
 spec:
-  gatewayClassName: <GATEWAY-NAME>
+  gatewayClassName: inference-gateway
   listeners:
   - name: http
     protocol: HTTP
@@ -17,7 +17,7 @@ spec:
 apiVersion: gateway.networking.k8s.io/v1
 kind: GatewayClass
 metadata:
-  name: <GATEWAY-NAME>
+  name: inference-gateway
 spec:
   controllerName: gateway.envoyproxy.io/gatewayclass-controller
 ---
@@ -38,7 +38,7 @@ metadata:
   name: llm-route
 spec:
   parentRefs:
-  - name: <GATEWAY-NAME>
+  - name: inference-gateway
     sectionName: llm-gw
   rules:
   - backendRefs:
4 changes: 2 additions & 2 deletions pkg/manifests/patch_policy.yaml
@@ -7,7 +7,7 @@ spec:
   targetRef:
     group: gateway.networking.k8s.io
     kind: Gateway
-    name: <GATEWAY-NAME>
+    name: inference-gateway
   type: JSONPatch
   jsonPatches:
   # Necessary to create a cluster of the type: ORIGINAL_DST to allow for
@@ -36,7 +36,7 @@ spec:
           max_requests: 40000
 
   - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
-    name: default/<GATEWAY-NAME>/llm-gw
+    name: default/inference-gateway/llm-gw
     operation:
       op: replace
       path: "/virtual_hosts/0/routes/0/route/cluster"
