From d29c8842d55575bcd49cea1899de57c14c7b8800 Mon Sep 17 00:00:00 2001
From: Aaron Miller
Date: Tue, 10 Oct 2023 11:10:25 -0700
Subject: [PATCH] python bindings should be quiet by default

* disable llama.cpp logging unless GPT4ALL_VERBOSE_LLAMACPP envvar is nonempty
* make verbose flag for retrieve_model default false (but also be overridable
  via gpt4all constructor)

should be able to run a basic test:

```python
import gpt4all
model = gpt4all.GPT4All('/Users/aaron/Downloads/rift-coder-v0-7b-q4_0.gguf')
print(model.generate('def fib(n):'))
```

and see no non-model output when successful
---
 gpt4all-backend/llamamodel.cpp             | 18 +++++++++++++++++-
 gpt4all-bindings/python/gpt4all/gpt4all.py |  5 +++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 7fd9fce129b5..4f7ea78a8dd2 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -36,6 +36,17 @@ namespace {
 const char *modelType_ = "LLaMA";
 }
 
+static void null_log_callback(enum ggml_log_level, const char*, void*) {
+    return;
+}
+
+static bool llama_verbose() {
+    const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
+    if (var == nullptr) return false;
+    if (var[0] == 0) return false;
+    return true;
+}
+
 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -144,7 +155,9 @@ bool LLamaModel::loadModel(const std::string &modelPath)
     d_ptr->params.use_mlock = params.use_mlock;
 #endif
 #ifdef GGML_USE_METAL
-    std::cerr << "llama.cpp: using Metal" << std::endl;
+    if (llama_verbose()) {
+        std::cerr << "llama.cpp: using Metal" << std::endl;
+    }
     // metal always runs the whole model if n_gpu_layers is not 0, at least
     // currently
     d_ptr->params.n_gpu_layers = 1;
@@ -390,6 +403,9 @@ DLL_EXPORT bool magic_match(const char * fname) {
 }
 
 DLL_EXPORT LLModel *construct() {
+    if (!llama_verbose()) {
+        llama_log_set(null_log_callback, nullptr);
+    }
     return new LLamaModel;
 }
 }
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index c6d5c9baa13f..8728648e9c84 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -65,6 +65,7 @@ def __init__(
         allow_download: bool = True,
         n_threads: Optional[int] = None,
         device: Optional[str] = "cpu",
+        verbose: bool = False,
     ):
         """
         Constructor
@@ -89,7 +90,7 @@ def __init__(
         self.model_type = model_type
         self.model = pyllmodel.LLModel()
         # Retrieve model and download if allowed
-        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download)
+        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
         if device is not None:
             if device != "cpu":
                 self.model.init_gpu(model_path=self.config["path"], device=device)
@@ -117,7 +118,7 @@ def retrieve_model(
         model_name: str,
         model_path: Optional[str] = None,
         allow_download: bool = True,
-        verbose: bool = True,
+        verbose: bool = False,
     ) -> ConfigType:
         """
         Find model file, and if it doesn't exist, download the model.
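
Note for reviewers: since both switches added here are opt-in, a minimal sketch of how to turn the output back on, assuming a locally present GGUF model (`/path/to/model.gguf` is a hypothetical placeholder):

```python
import os

# GPT4ALL_VERBOSE_LLAMACPP only needs to be non-empty; the backend reads it
# with getenv() when the llama.cpp plugin constructs the model, so it must be
# set before the model is created.
os.environ["GPT4ALL_VERBOSE_LLAMACPP"] = "1"

import gpt4all

# verbose=True is forwarded to retrieve_model(), restoring the Python-side
# messages that now default to off.
model = gpt4all.GPT4All('/path/to/model.gguf', verbose=True)
print(model.generate('def fib(n):'))
```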