-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Sophie Chen <sophiechen@microsoft.com>
- Loading branch information
1 parent
5c01f9c
commit 78d0402
Showing
6 changed files
with
105 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,14 @@ | ||
[ | ||
{ | ||
"data_path": "<path to data>", | ||
"location": "<azure region, e.g. 'westus2'>", | ||
"subscription_id": "<subscription id>", | ||
"resource_group": "<resource group name>", | ||
"search_service_name": "<search service name to use or create>", | ||
"index_name": "<index name to use or create>", | ||
"chunk_size": 1024, | ||
"token_overlap": 128, | ||
"semantic_config_name": "default", | ||
"language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers" | ||
} | ||
[ | ||
{ | ||
"data_path": "<path to data>", | ||
"location": "<azure region, e.g. 'westus2'>", | ||
"subscription_id": "<subscription id>", | ||
"resource_group": "<resource group name>", | ||
"search_service_name": "<search service name to use or create>", | ||
"index_name": "<index name to use or create>", | ||
"chunk_size": 1024, | ||
"token_overlap": 128, | ||
"semantic_config_name": "default", | ||
"language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import copy | ||
import json | ||
import os | ||
import subprocess | ||
import tqdm | ||
from openai import AzureOpenAI | ||
from dotenv import load_dotenv | ||
|
||
load_dotenv()

# Key passed to data_preparation.py via --form-rec-key; read from the
# environment (optionally populated from a .env file by load_dotenv above).
FORM_RECOGNIZER_KEY = os.getenv("FORM_RECOGNIZER_KEY")
if not FORM_RECOGNIZER_KEY:
    # Fail early with a clear message instead of a TypeError when the
    # command line is assembled further down.
    raise SystemExit(
        "FORM_RECOGNIZER_KEY is not set; define it in the environment or in .env"
    )

# config.json holds a list; its first entry is the template that is cloned
# for every data set below.
with open("./config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# This is an example.
# It shows how to handle subfolders.
# It also provides the option whether to use the Form Recognizer layout flag.
#
# Each key is a folder name under /index_data. The value is either the index
# name (string shorthand) or a dict with:
#   "index"               - index name (required)
#   "subfolder"           - optional subfolder of the data folder to ingest
#   "form-rec-use-layout" - optional bool, defaults to True
run_config_by_data_path_3_small_512_512 = {
    "aks": "aks_embed_003_small_512_512_index",
    "azure-docs": {
        "index": "azure_embed_003_small_512_512_index",
        "subfolder": "azure-docs",
    },
    "test_loranorm": {
        "index": "test_loranorm_embed_003_small_512_512_index",
        "form-rec-use-layout": False,
    },
}

for key, cfg in tqdm.tqdm(run_config_by_data_path_3_small_512_512.items()):
    # folder is where the data is saved
    folder = os.path.join("/index_data", key)

    if isinstance(cfg, str):
        # String shorthand: the value is the index name, layout flag enabled.
        index = cfg
        form_rec_use_layout = True
    else:
        index = cfg["index"]
        form_rec_use_layout = cfg.get("form-rec-use-layout", True)
        if "subfolder" in cfg:
            folder = os.path.join(folder, cfg["subfolder"])

    # Clone the template so per-data-set overrides never leak between runs.
    config_key = copy.deepcopy(config[0])
    config_key["data_path"] = os.path.abspath(folder)
    config_key["index_name"] = index

    print(config_key["data_path"])
    config_file = f"config.{key}.json"
    with open(f"./{config_file}", "w", encoding="utf-8") as f:
        json.dump([config_key], f)

    command = [
        "python",
        "data_preparation.py",
        "--config",
        config_file,
        "--embedding-model-endpoint",
        # Placeholder: replace with the real embedding endpoint. The embedded
        # double quotes were dropped because each list element is passed to
        # the child process verbatim, so no shell-style quoting is needed.
        "EMBEDDING_MODEL_ENDPOINT",
        "--form-rec-resource",
        "test-tprompt",
        "--form-rec-key",
        FORM_RECOGNIZER_KEY,
    ]
    if form_rec_use_layout:
        command.append("--form-rec-use-layout")
    command.append("--njobs=8")

    # Pass the argument list directly: a single joined string without
    # shell=True is treated as the executable name on POSIX and fails, and
    # keeping a list avoids shell quoting/injection issues altogether.
    proc = subprocess.run(
        command,
        capture_output=True,
        text=True,
        errors="replace",
    )
    if proc.returncode != 0:
        # Best effort: report the failure and continue with the next data set.
        print("Error running", command)
        print(proc.stderr)
        print(proc.stdout)
|
||
|
||
|
||
|
||
|
||
|