add extra dimension support (#865)
Co-authored-by: Sophie Chen <sophiechen@microsoft.com>
SophieGarden and Sophie Chen authored May 24, 2024
1 parent 5c01f9c commit 78d0402
Showing 6 changed files with 105 additions and 16 deletions.
1 change: 1 addition & 0 deletions scripts/.env.sample
@@ -1,6 +1,7 @@
# resource switch
FLAG_EMBEDDING_MODEL = "AOAI" # "AOAI" or "COHERE"
FLAG_COHERE = "ENGLISH" # "MULTILINGUAL" or "ENGLISH" options for Cohere embedding models
+FLAG_AOAI = "V3" # "V2" or "V3" options for AOAI embedding models

# update vector dimension based on model chosen
VECTOR_DIMENSION = 1536 # change to the desired size, e.g., 1536 for AOAI ada-002, 1024 for Cohere
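Since the scripts read these values from the environment, here is a minimal sketch (assuming the flag names above; the check itself is not in the repo) of validating that the chosen dimension matches the model family:

import os

flag_model = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
flag_aoai = os.getenv("FLAG_AOAI", "V3")
dimension = int(os.getenv("VECTOR_DIMENSION", 1536))  # env values are strings

# Illustrative check: ada-002 ("V2") always emits 1536-dimensional vectors,
# while the "V3" (text-embedding-3-*) models accept a user-chosen dimension.
if flag_model == "AOAI" and flag_aoai == "V2" and dimension != 1536:
    raise ValueError("AOAI V2 (ada-002) embeddings are always 1536-dimensional")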
26 changes: 13 additions & 13 deletions scripts/config.json
@@ -1,14 +1,14 @@
[
    {
        "data_path": "<path to data>",
        "location": "<azure region, e.g. 'westus2'>",
        "subscription_id": "<subscription id>",
        "resource_group": "<resource group name>",
        "search_service_name": "<search service name to use or create>",
        "index_name": "<index name to use or create>",
        "chunk_size": 1024,
        "token_overlap": 128,
        "semantic_config_name": "default",
        "language": "<language to support, e.g. 'en' for English. Check supported languages under 'Lucene' here: https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers>"
    }
]
2 changes: 1 addition & 1 deletion scripts/data_preparation.py
@@ -232,7 +232,7 @@ def create_or_update_search_index(
"type": "Collection(Edm.Single)",
"searchable": True,
"retrievable": True,
"dimensions": os.getenv("VECTOR_DIMENSION", 1536),
"dimensions": int(os.getenv("VECTOR_DIMENSION", 1536)),
"vectorSearchConfiguration": vector_config_name
})
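The added int() matters because os.getenv returns a string whenever the variable is set; only the non-string fallback default comes back as an int. A quick illustration:

import os

os.environ["VECTOR_DIMENSION"] = "1024"
print(type(os.getenv("VECTOR_DIMENSION", 1536)))       # <class 'str'>
print(type(int(os.getenv("VECTOR_DIMENSION", 1536))))  # <class 'int'>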

10 changes: 8 additions & 2 deletions scripts/data_utils.py
@@ -651,6 +651,7 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,

    FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
    FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
+   FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")

    if azure_credential is None and (endpoint is None or key is None):
        raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
@@ -666,8 +667,13 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,
    else:
        api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")

-   client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, azure_ad_token=api_key)
-   embeddings = client.embeddings.create(model=deployment_id, input=text)
+   client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
+   if FLAG_AOAI == "V2":
+       embeddings = client.embeddings.create(model=deployment_id, input=text)
+   elif FLAG_AOAI == "V3":
+       embeddings = client.embeddings.create(model=deployment_id,
+                                             input=text,
+                                             dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)))

    return embeddings.dict()['data'][0]['embedding']
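For reference, a standalone sketch of the V3 call path; the endpoint, key, deployment name, and API version below are placeholders, and the dimensions argument is only accepted by text-embedding-3-* deployments:

import os
from openai import AzureOpenAI

client = AzureOpenAI(
    api_version="2024-06-01",  # placeholder; use a version that supports `dimensions`
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)
response = client.embeddings.create(
    model="text-embedding-3-small",  # your deployment name
    input="hello world",
    dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)),
)
assert len(response.data[0].embedding) == int(os.getenv("VECTOR_DIMENSION", 1536))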

3 changes: 3 additions & 0 deletions scripts/readme.md
@@ -36,6 +36,9 @@ Disclaimer: Make sure there are no duplicate pages in your data. That could impa

`python data_preparation.py --config config.json --njobs=4`

+### Batch creation of index
+Refer to the script `run_batch_create_index.py` to create multiple indexes in one batch run.
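For example, from the `scripts` directory (a sketch; the script reads `config.json` and the `.env` flags shown above):

`python run_batch_create_index.py`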

## Optional: Use URL prefix
Each document can be associated with a URL, stored with each document chunk in the `url` field of the Azure Cognitive Search index. If your documents were downloaded from the web, you can specify a URL prefix used to construct the document URLs when ingesting your data. Your config file should then have an additional `url_prefix` parameter like so:
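A minimal sketch of such a config entry (the prefix value is a placeholder; other fields as in `config.json` above):

[
    {
        "data_path": "<path to data>",
        "url_prefix": "https://<your-domain>/<path>/"
    }
]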

79 changes: 79 additions & 0 deletions scripts/run_batch_create_index.py
@@ -0,0 +1,79 @@
import copy
import json
import os
import subprocess
import tqdm
from dotenv import load_dotenv

load_dotenv()

FORM_RECOGNIZER_KEY = os.getenv("FORM_RECOGNIZER_KEY")

with open("./config.json", "r") as f:
    config = json.loads(f.read())

# This run config is an example:
# it shows how to handle subfolders
# and provides an option for whether to use Form Recognizer layout.
run_config_by_data_path_3_small_512_512 = {
    "aks": "aks_embed_003_small_512_512_index",
    "azure-docs": {
        "index": "azure_embed_003_small_512_512_index",
        "subfolder": "azure-docs",
    },
    "test_loranorm": {
        "index": "test_loranorm_embed_003_small_512_512_index",
        "form-rec-use-layout": False,
    },
}

for key, cfg in tqdm.tqdm(run_config_by_data_path_3_small_512_512.items()):
    # folder is where data is saved
    folder = os.path.join("/index_data", key)

    if isinstance(cfg, str):
        index = cfg
        form_rec_use_layout = True
    else:
        index = cfg["index"]
        form_rec_use_layout = cfg.get("form-rec-use-layout", True)
        if "subfolder" in cfg:
            folder = os.path.join(folder, cfg["subfolder"])

    config_key = copy.deepcopy(config[0])
    config_key["data_path"] = os.path.abspath(folder)
    config_key["index_name"] = index

    print(config_key["data_path"])
    with open(f"./config.{key}.json", "w") as f:
        f.write(json.dumps([config_key]))

    command = [
        "python",
        "data_preparation.py",
        "--config",
        f"config.{key}.json",
        "--embedding-model-endpoint",
        '"EMBEDDING_MODEL_ENDPOINT"',
        "--form-rec-resource",
        "test-tprompt",
        "--form-rec-key",
        FORM_RECOGNIZER_KEY,
    ] + (["--form-rec-use-layout"] if form_rec_use_layout else []) + [
        "--njobs=8",
    ]
    # Pass the argument list directly; a command joined into a single
    # string only works with shell=True.
    proc = subprocess.run(command, capture_output=True)
    if proc.returncode != 0:
        print("Error running", command)
        print(proc.stderr)
        print(proc.stdout)