Merge pull request #33 from vectara/dev
Dev
eskibars authored Feb 23, 2024
2 parents 4399273 + 52b069b commit f78c24d
Showing 1 changed file with 39 additions and 0 deletions.
serving.proto (+39 −0)
@@ -69,6 +69,45 @@ message SummarizationRequest {
// Vectara manages both the system and user roles and prompts for the
// generative LLM out of the box. However, Scale customers can override the
// prompt via this variable. The prompt_text takes the form of an
// Apache Velocity template; for details on how to configure it, see the
// long-form documentation at
// https://docs.vectara.com/docs/prompts/vectara-prompt-engine
// See https://vectara.com/pricing/ for more details on becoming a Scale customer.
string prompt_text = 200;
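
// Illustrative sketch of a prompt_text value, not normative: it assumes the
// template variables (e.g. $vectaraQueryResults, $vectaraQuery) and the
// $esc.java() escaper described in the prompt-engine docs linked above. A
// template renders to a JSON array of chat messages, with Velocity directives
// iterating over the query results:
//
//   [
//     {"role": "system", "content": "You are a search assistant that summarizes results."},
//     #foreach ($qResult in $vectaraQueryResults)
//       {"role": "user", "content": "Result: ${esc.java($qResult.getText())}"},
//     #end
//     {"role": "user", "content": "Summarize the results above for the query: ${esc.java($vectaraQuery)}"}
//   ]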

// Debugging the generative prompt is currently a Scale-only feature.
// See https://vectara.com/pricing/ for more details on becoming a Scale customer.
bool debug = 205;

// Controls the approximate length of the summary, in characters.
// This is a rough target and not a hard limit: the final summary can be longer
// or shorter than this value. This is currently a Scale-only feature.
// See https://vectara.com/pricing/ for more details on becoming a Scale customer.
uint32 response_chars = 210;
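
// Illustrative only: in proto text format, asking for a summary of roughly
// 300 characters would be expressed as:
//   response_chars: 300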

// Parameters for the summarizer model. These are currently a Scale-only feature.
// See https://vectara.com/pricing/ for more details on becoming a Scale customer.
// WARNING: This is an experimental feature and may change or break at any point
// with virtually no notice. It is meant for experimentation to converge on
// optimal parameters that can then be set in the prompt definitions.
message ModelParams {
// The maximum number of tokens to generate in the summary.
optional uint32 max_tokens = 5;
// The sampling temperature to use. Higher values make the summary more random, while lower
// values make it more focused and deterministic.
optional float temperature = 10;
// Higher values penalize new tokens based on their existing frequency in the text so far,
// decreasing the model's likelihood to repeat the same line verbatim.
optional float frequency_penalty = 15;
// Higher values penalize new tokens based on whether they appear in the text so far,
// increasing the model's likelihood to talk about new topics.
optional float presence_penalty = 20;
}
ModelParams model_params = 215;
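
// Illustrative only: in proto text format, a conservative, repetition-averse
// configuration might look like:
//   model_params {
//     max_tokens: 512
//     temperature: 0.2
//     frequency_penalty: 0.3
//     presence_penalty: 0.0
//   }
// The low temperature biases sampling toward high-probability tokens, while
// the positive frequency_penalty discourages verbatim repetition. The values
// here are assumptions for illustration, not recommended settings.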


// If present, the query will be treated as a chat query.
// When using chat, only one summarization request is allowed per query.
ChatRequest chat = 225;
