add extra dimension support (#865)

Co-authored-by: Sophie Chen <sophiechen@microsoft.com>
microsoft · May 24, 2024 · 78d0402 · 78d0402
1 parent 5c01f9c
commit 78d0402
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 16 deletions.
diff --git a/scripts/.env.sample b/scripts/.env.sample
@@ -1,6 +1,7 @@
 # resource switch 
 FLAG_EMBEDDING_MODEL = "AOAI" # "AOAI" or "COHERE"
 FLAG_COHERE = "ENGLISH" # "MULTILINGUAL" or "ENGLISH" options for Cohere embedding models
+FLAG_AOAI = "V3" # "V2" or "V3" options for AOAI embedding models
 
 # update vector dimension based on model chosen
 VECTOR_DIMENSION = 1536 # change it to desired, e.g., 1536 for AOAI ada 002, 1024 for COHERE

diff --git a/scripts/config.json b/scripts/config.json
@@ -1,14 +1,14 @@
-[
-    {
-        "data_path": "<path to data>",
-        "location": "<azure region, e.g. 'westus2'>",
-        "subscription_id": "<subscription id>",
-        "resource_group": "<resource group name>",
-        "search_service_name": "<search service name to use or create>",
-        "index_name": "<index name to use or create>",
-        "chunk_size": 1024,
-        "token_overlap": 128,
-        "semantic_config_name": "default",
-        "language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
-    }
+[
+    {
+        "data_path": "<path to data>",
+        "location": "<azure region, e.g. 'westus2'>",
+        "subscription_id": "<subscription id>",
+        "resource_group": "<resource group name>",
+        "search_service_name": "<search service name to use or create>",
+        "index_name": "<index name to use or create>",
+        "chunk_size": 1024,
+        "token_overlap": 128,
+        "semantic_config_name": "default",
+        "language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
+    }
 ]
diff --git a/scripts/data_preparation.py b/scripts/data_preparation.py
@@ -232,7 +232,7 @@ def create_or_update_search_index(
             "type": "Collection(Edm.Single)",
             "searchable": True,
             "retrievable": True,
-            "dimensions": os.getenv("VECTOR_DIMENSION", 1536),
+            "dimensions": int(os.getenv("VECTOR_DIMENSION", 1536)),
             "vectorSearchConfiguration": vector_config_name
         })
 

diff --git a/scripts/data_utils.py b/scripts/data_utils.py
@@ -651,6 +651,7 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,
 
     FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
     FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
+    FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")
 
     if azure_credential is None and (endpoint is None or key is None):
         raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
@@ -666,8 +667,13 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,
             else:
                 api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")
 
-            client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, azure_ad_token=api_key)
-            embeddings = client.embeddings.create(model=deployment_id, input=text)
+            client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
+            if FLAG_AOAI == "V2":
+                embeddings = client.embeddings.create(model=deployment_id, input=text)
+            elif FLAG_AOAI == "V3":   
+                embeddings = client.embeddings.create(model=deployment_id, 
+                                                      input=text, 
+                                                      dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)))
 
             return embeddings.dict()['data'][0]['embedding']
 

diff --git a/scripts/readme.md b/scripts/readme.md
@@ -36,6 +36,9 @@ Disclaimer: Make sure there are no duplicate pages in your data. That could impa
 
      `python data_preparation.py --config config.json --njobs=4`
 
+### Batch creation of index
+See the script `run_batch_create_index.py` for an example of creating multiple indexes in a single batch run.
+
 ## Optional: Use URL prefix
 Each document can be associated with a URL that is stored with each document chunk in the Azure Cognitive Search index in the `url` field. If your documents were downloaded from the web, you can specify a URL prefix to use to construct the document URLs when ingesting your data. Your config file should have an additional `url_prefix` parameter like so:
 

diff --git a/scripts/run_batch_create_index.py b/scripts/run_batch_create_index.py
@@ -0,0 +1,79 @@
+import copy
+import json
+import os
+import subprocess
+import tqdm
+from openai import AzureOpenAI
+from dotenv import load_dotenv
+
+load_dotenv()  
+
+FORM_RECOGNIZER_KEY = os.getenv("FORM_RECOGNIZER_KEY")
+if not FORM_RECOGNIZER_KEY:
+    raise ValueError("FORM_RECOGNIZER_KEY must be set in the environment or .env")
+
+with open("./config.json", "r") as f:
+    config = json.load(f)
+
+# Example run configuration, keyed by data folder name under /index_data.
+# A plain string value is the index name; a dict value may also name a
+# subfolder and choose whether to pass --form-rec-use-layout.
+run_config_by_data_path_3_small_512_512 = {
+    "aks": "aks_embed_003_small_512_512_index",
+    "azure-docs": {
+        "index": "azure_embed_003_small_512_512_index",
+        "subfolder": "azure-docs",
+    },
+    "test_loranorm": {
+        "index": "test_loranorm_embed_003_small_512_512_index",
+        "form-rec-use-layout": False,
+    },
+}
+
+for key, cfg in tqdm.tqdm(run_config_by_data_path_3_small_512_512.items()):
+    # folder is where the data for this entry is saved
+    folder = os.path.join("/index_data", key)
+
+    if isinstance(cfg, str):
+        index = cfg
+        form_rec_use_layout = True
+    else:
+        index = cfg["index"]
+        form_rec_use_layout = cfg.get("form-rec-use-layout", True)
+        if "subfolder" in cfg:
+            folder = os.path.join(folder, cfg["subfolder"])
+
+    config_key = copy.deepcopy(config[0])
+    config_key["data_path"] = os.path.abspath(folder)
+    config_key["index_name"] = index
+
+    print(config_key["data_path"])
+    with open(f"./config.{key}.json", "w") as f:
+        json.dump([config_key], f)
+
+    command = [
+        "python",
+        "data_preparation.py",
+        "--config",
+        f"config.{key}.json",
+        "--embedding-model-endpoint",
+        "EMBEDDING_MODEL_ENDPOINT",  # NOTE(review): looks like a placeholder; confirm
+        "--form-rec-resource",
+        "test-tprompt",
+        "--form-rec-key",
+        FORM_RECOGNIZER_KEY,
+    ] + (["--form-rec-use-layout"] if form_rec_use_layout else []) + [
+        "--njobs=8",
+    ]
+    # Pass argv as a list: a joined string without shell=True fails on POSIX.
+    proc = subprocess.run(command, capture_output=True, text=True, errors="replace")
+    if proc.returncode != 0:
+        print("Error running", command)
+        print(proc.stderr)
+        print(proc.stdout)
+
+
+
+
+
+