diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 999be845caac31..89cae91851104b 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -45,7 +45,7 @@ jobs: repo_token: ${{ secrets.GITHUB_TOKEN }} skip_when_only_listed_labels_set: 'docs' skip_when_only_listed_files_changed: '*.md,*.rst,*.png,*.jpg,*.svg,*/layer_tests_summary/*,*/conformance/*' - + - name: Get target branch id: set_target_branch run: | @@ -192,7 +192,7 @@ jobs: sparse-checkout: | src/bindings/js path: 'openvino' - + - name: Download OpenVINO artifacts (JS) uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: @@ -223,7 +223,7 @@ jobs: run: call npm test - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v2 + uses: microsoft/setup-msbuild@6fb02220983dee41ce7ae257b6f4d8f9bf5ed4ce # v2 - name: E2E of openvino-node package working-directory: ${{ env.OPENVINO_JS_DIR }}/node diff --git a/.github/workflows/workflows_scans.yml b/.github/workflows/workflows_scans.yml index 1a3d091544e784..ace73b18751606 100644 --- a/.github/workflows/workflows_scans.yml +++ b/.github/workflows/workflows_scans.yml @@ -18,6 +18,37 @@ concurrency: permissions: read-all jobs: + codeql: + name: github_actions_workflows_scan/codeql + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners + # Consider using larger runners for possible analysis time improvements. + runs-on: ubuntu-22.04 + timeout-minutes: 60 + permissions: + security-events: write + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + timeout-minutes: 15 + with: + submodules: 'false' + sparse-checkout: .github/workflows + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 + with: + languages: "actions" + build-mode: "none" + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@df409f7d9260372bd5f19e5b04e83cb3c43714ae # v3.27.9 + with: + category: "/language:actions" + semgrep: name: github_actions_workflows_scan/semgrep runs-on: ubuntu-latest diff --git a/docs/articles_en/about-openvino/release-notes-openvino.rst b/docs/articles_en/about-openvino/release-notes-openvino.rst index f898ddaf42ba03..70af3ce17566e6 100644 --- a/docs/articles_en/about-openvino/release-notes-openvino.rst +++ b/docs/articles_en/about-openvino/release-notes-openvino.rst @@ -105,7 +105,7 @@ Deprecation And Support Using deprecated features and components is not advised. They are available to enable a smooth transition to new solutions and will be discontinued in the future. To keep using discontinued features, you will have to revert to the last LTS OpenVINO version supporting them. -For more details, refer to the `OpenVINO Legacy Features and Components __` +For more details, refer to the `OpenVINO Legacy Features and Components __` page. 
diff --git a/docs/articles_en/get-started/learn-openvino/openvino-samples/get-started-demos.rst b/docs/articles_en/get-started/learn-openvino/openvino-samples/get-started-demos.rst index f61ccf5cacd2f3..32dde2fd2a324b 100644 --- a/docs/articles_en/get-started/learn-openvino/openvino-samples/get-started-demos.rst +++ b/docs/articles_en/get-started/learn-openvino/openvino-samples/get-started-demos.rst @@ -262,7 +262,7 @@ You need a model that is specific for your inference task. You can get it from o Convert the Model -------------------- -If Your model requires conversion, check the `article `__ for information how to do it. +If Your model requires conversion, check the :doc:`article <../../../openvino-workflow/model-preparation>` for information how to do it. .. _download-media: diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst index 232e0f2c2a66b9..80c98e1b857522 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst @@ -5,8 +5,9 @@ LLM Weight Compression :maxdepth: 1 :hidden: - weight-compression/microscaling-quantization weight-compression/4-bit-weight-quantization + weight-compression/microscaling-quantization + Weight compression enhances the efficiency of models by reducing their memory footprint, @@ -16,14 +17,13 @@ Unlike full model quantization, where both weights and activations are quantized only targets weights, keeping activations as floating-point numbers. This means preserving most of the model's accuracy while improving its speed and reducing its size. The reduction in size is especially noticeable with larger models. -For instance the 7 billion parameter Llama 2 model can be reduced -from about 25GB to 4GB using 4-bit weight compression. +For instance the 8 billion parameter Llama 3 model can be reduced +from about 16.1 GB to 4.8 GB using 4-bit weight quantization on top of bfloat16 model. .. note:: - With smaller language models (i.e. less than 1B parameters), weight + With smaller language models (i.e. less than 1B parameters), low-bit weight compression may result in more accuracy reduction than with larger models. - Therefore, weight compression is recommended for use with LLMs only. LLMs and other GenAI models that require extensive memory to store the weights during inference can benefit @@ -36,7 +36,7 @@ from weight compression as it: * improves inference speed by reducing the latency of memory access when computing the operations with weights, for example, Linear layers. The weights are smaller and thus faster to load from memory; -* unlike quantization, does not require sample data to calibrate the range of +* unlike full static quantization, does not require sample data to calibrate the range of activation values. Currently, `NNCF `__ @@ -64,7 +64,7 @@ by running the following command: pip install optimum[openvino] **8-bit weight quantization** offers a good balance between reducing the size and lowering the -accuracy of a model. It usually results in significant improvements for transformer-based models +accuracy of a model. It usually results in significant improvements for Transformer-based models and guarantees good model performance for a vast majority of supported CPU and GPU platforms. By default, weights are compressed asymmetrically to "INT8_ASYM" mode. @@ -223,17 +223,6 @@ depending on the model. 
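As a quick illustration of the default behaviour described above, the sketch below applies 8-bit weight compression to a model that has already been converted to OpenVINO IR. It is a minimal example with placeholder file names; without extra arguments, ``nncf.compress_weights`` uses the INT8_ASYM mode mentioned in this section.

.. code-block:: python

    import nncf
    import openvino as ov

    core = ov.Core()
    model = core.read_model("model.xml")  # placeholder path to an OpenVINO IR model

    # Compress only the weights; activations stay in floating point.
    # With no extra arguments NNCF defaults to the INT8_ASYM mode.
    compressed_model = nncf.compress_weights(model)

    ov.save_model(compressed_model, "model_int8.xml")

No calibration data is needed for this default, data-free mode, which is what distinguishes weight-only compression from full static quantization.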
For more details, refer to the article on how to :doc:`infer LLMs using Optimum Intel <../../../openvino-workflow-generative/inference-with-optimum-intel>`. -The code snippet below shows how to do 4-bit quantization of the model weights represented -in OpenVINO IR using NNCF: - -.. tab-set:: - - .. tab-item:: OpenVINO - :sync: openvino - - .. doxygensnippet:: docs/optimization_guide/nncf/code/weight_compression_openvino.py - :language: python - :fragment: [compression_4bit] Refer to the article about :doc:`4-bit weight quantization <./weight-compression/4-bit-weight-quantization>` diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst index ae9bc7d7b8b4a3..3994e5550c4e2f 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression/4-bit-weight-quantization.rst @@ -133,7 +133,12 @@ trade-offs after optimization: There are three modes: INT8_ASYM, INT8_SYM, and NONE, which retains the original floating-point precision of the model weights (``INT8_ASYM`` is default value). -| + + +.. tip:: + + NNCF allows stacking the supported optimization methods. For example, AWQ, Scale Estimation + and GPTQ methods can be enabled all together to achieve better accuracy. 4-bit Weight Quantization with GPTQ ################################### diff --git a/docs/articles_en/openvino-workflow/model-optimization.rst b/docs/articles_en/openvino-workflow/model-optimization.rst index f5a5f97341e960..e44cf556329bd1 100644 --- a/docs/articles_en/openvino-workflow/model-optimization.rst +++ b/docs/articles_en/openvino-workflow/model-optimization.rst @@ -21,24 +21,24 @@ In OpenVINO, the default optimization tool is NNCF (Neural Network Compression F It is a `set of compression algorithms `__, organized as a Python package, that make your models smaller and faster. Note that NNCF is **not part of the OpenVINO package**, so it needs to be installed separately. It supports -models in **PyTorch**, **TensorFlow** , **ONNX**, and **OpenVINO IR** formats, offering +models in **OpenVINO IR**, **PyTorch**, **ONNX**, and **TensorFlow** formats, offering the following main optimizations: .. image:: ../assets/images/WHAT_TO_USE.svg | :doc:`Weight Compression `: -| an easy-to-use method for Large Language Model footprint reduction and inference +| An easy-to-use method for Large Language Model footprint reduction and inference acceleration. | :doc:`Post-training Quantization `: -| designed to optimize deep learning models by applying 8-bit integer quantization. Being +| Designed to optimize deep learning models by applying 8-bit integer quantization. Being the easiest way to optimize a model it does not require its retraining or fine-tuning but may result in a drop in accuracy. If the accuracy-performance tradeoff is not acceptable, Training-time Optimization may be a better option. | :doc:`Training-time Optimization `: -| involves a suite of advanced methods such as Structured or Unstructured Pruning, as well +| Involves a suite of advanced methods such as Structured or Unstructured Pruning, as well as Quantization-aware Training. This kind of optimization requires the use of the model's original framework, for NNCF, it is either PyTorch or TensorFlow. @@ -54,13 +54,7 @@ Recommended workflows 3. 
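To make the stacking tip above more concrete, here is a minimal, data-free sketch of 4-bit weight compression with NNCF; the file names are placeholders. The data-aware methods named in the tip (AWQ, Scale Estimation, GPTQ) are only indicated in a comment because they additionally require a calibration dataset.

.. code-block:: python

    import nncf
    import openvino as ov

    core = ov.Core()
    model = core.read_model("model.xml")  # placeholder path to an OpenVINO IR model

    compressed_model = nncf.compress_weights(
        model,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        ratio=0.8,       # roughly 80% of weights go to 4-bit, the rest stay 8-bit
        group_size=128,  # group-wise quantization granularity
    )

    # The stacked, data-aware variant from the tip passes dataset=nncf.Dataset(...)
    # together with awq=True, scale_estimation=True (and optionally gptq=True)
    # to the same call.

    ov.save_model(compressed_model, "model_int4.xml")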
If the accuracy drop is unacceptable, use quantization-aware training instead. It will give you the same level of performance boost, with a smaller impact on accuracy. -* **Weight compression** works **only with LLMs**. Do not try to use it with other models. -* For **visual-multimodal** use cases, the encoder / decoder split approach may be recommended. - - - - - +* **Weight compression** works with **LLMs**, **VLMs** and other Transformer-based models. diff --git a/docs/notebooks/convert-to-openvino-with-output.rst b/docs/notebooks/convert-to-openvino-with-output.rst index 8cf851bfcd9c16..23e93612b2cad5 100644 --- a/docs/notebooks/convert-to-openvino-with-output.rst +++ b/docs/notebooks/convert-to-openvino-with-output.rst @@ -54,7 +54,7 @@ OpenVINO IR format OpenVINO `Intermediate Representation -(IR) `__ +(IR) `__ is the proprietary model format of OpenVINO. It is produced after converting a model with model conversion API. Model conversion API translates the frequently used deep learning operations to their diff --git a/docs/notebooks/cross-lingual-books-alignment-with-output.rst b/docs/notebooks/cross-lingual-books-alignment-with-output.rst index b9f4024dcb8f0f..047c9d4a733552 100644 --- a/docs/notebooks/cross-lingual-books-alignment-with-output.rst +++ b/docs/notebooks/cross-lingual-books-alignment-with-output.rst @@ -941,7 +941,7 @@ advance and fill it in as the inference requests are executed. Let’s compare the models and plot the results. **Note**: To get a more accurate benchmark, use the `Benchmark Python - Tool `__ + Tool `__ .. code:: ipython3 diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst index 13da68e4db5001..265707b944aa95 100644 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst +++ b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst @@ -623,7 +623,7 @@ Compare Performance of the FP32 IR Model and Quantized Models To measure the inference performance of the ``FP32`` and ``INT8`` models, we use `Benchmark -Tool `__ +Tool `__ - OpenVINO’s inference performance measurement tool. Benchmark tool is a command line application, part of OpenVINO development tools, that can be run in the notebook with ``! benchmark_app`` or diff --git a/docs/notebooks/ddcolor-image-colorization-with-output.rst b/docs/notebooks/ddcolor-image-colorization-with-output.rst index 6215f42113cacd..ef848cc586e016 100644 --- a/docs/notebooks/ddcolor-image-colorization-with-output.rst +++ b/docs/notebooks/ddcolor-image-colorization-with-output.rst @@ -499,7 +499,7 @@ Compare inference time of the FP16 and INT8 models To measure the inference performance of OpenVINO FP16 and INT8 models, use `Benchmark -Tool `__. +Tool `__. **NOTE**: For the most accurate performance estimation, it is recommended to run ``benchmark_app`` in a terminal/command prompt diff --git a/docs/notebooks/depth-anything-v2-with-output.rst b/docs/notebooks/depth-anything-v2-with-output.rst index e5106f8b8781db..f45fe775883264 100644 --- a/docs/notebooks/depth-anything-v2-with-output.rst +++ b/docs/notebooks/depth-anything-v2-with-output.rst @@ -977,7 +977,7 @@ Compare inference time of the FP16 and INT8 models To measure the inference performance of OpenVINO FP16 and INT8 models, use `Benchmark -Tool `__. +Tool `__. 
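``benchmark_app`` remains the recommended way to measure performance, but for a rough in-notebook comparison the OpenVINO Python API can be used directly. The sketch below assumes static input shapes and placeholder IR paths for the FP16 and INT8 models; it only reports an average synchronous latency.

.. code-block:: python

    import time

    import numpy as np
    import openvino as ov

    core = ov.Core()


    def average_latency_ms(model_path: str, device: str = "CPU", runs: int = 50) -> float:
        # Compile the model and time synchronous inference on random data.
        compiled = core.compile_model(model_path, device)
        input_port = compiled.input(0)
        data = np.random.rand(*input_port.shape).astype(np.float32)
        request = compiled.create_infer_request()
        request.infer({input_port: data})  # warm-up run
        start = time.perf_counter()
        for _ in range(runs):
            request.infer({input_port: data})
        return (time.perf_counter() - start) / runs * 1000


    # Placeholder paths for the FP16 and INT8 IR files produced earlier.
    print(f"FP16: {average_latency_ms('model_fp16.xml'):.2f} ms")
    print(f"INT8: {average_latency_ms('model_int8.xml'):.2f} ms")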
**NOTE**: For the most accurate performance estimation, it is recommended to run ``benchmark_app`` in a terminal/command prompt diff --git a/docs/notebooks/depth-anything-with-output.rst b/docs/notebooks/depth-anything-with-output.rst index 49f15f430f9746..144a54d6e6f4ac 100644 --- a/docs/notebooks/depth-anything-with-output.rst +++ b/docs/notebooks/depth-anything-with-output.rst @@ -940,7 +940,7 @@ Compare inference time of the FP16 and INT8 models To measure the inference performance of OpenVINO FP16 and INT8 models, use `Benchmark -Tool `__. +Tool `__. **NOTE**: For the most accurate performance estimation, it is recommended to run ``benchmark_app`` in a terminal/command prompt diff --git a/docs/notebooks/fast-segment-anything-with-output.rst b/docs/notebooks/fast-segment-anything-with-output.rst index 0071e2dca60e74..32915c9a16dc95 100644 --- a/docs/notebooks/fast-segment-anything-with-output.rst +++ b/docs/notebooks/fast-segment-anything-with-output.rst @@ -100,29 +100,29 @@ Imports import ipywidgets as widgets from pathlib import Path - + import openvino as ov import torch from PIL import Image from ultralytics import FastSAM - + # Fetch skip_kernel_extension module import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", ) open("skip_kernel_extension.py", "w").write(r.text) # Fetch `notebook_utils` module import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) - + open("notebook_utils.py", "w").write(r.text) from notebook_utils import download_file, device_widget - + %load_ext skip_kernel_extension FastSAM in Ultralytics @@ -142,7 +142,7 @@ model and generate a segmentation map. model_name = "FastSAM-x" model = FastSAM(model_name) - + # Run inference on an image image_uri = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg" image_uri = download_file(image_uri) @@ -169,7 +169,7 @@ model and generate a segmentation map. .. parsed-literal:: - + image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 768x1024 37 objects, 638.3ms Speed: 3.4ms preprocess, 638.3ms inference, 500.4ms postprocess per image at shape (1, 3, 768, 1024) @@ -210,16 +210,16 @@ tracing. The FastSAM model itself is based on YOLOv8 model. .. parsed-literal:: Ultralytics YOLOv8.2.24 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) - + PyTorch: starting from 'FastSAM-x.pt' with input shape (1, 3, 1024, 1024) BCHW and output shape(s) ((1, 37, 21504), (1, 32, 256, 256)) (138.3 MB) - + OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... 
OpenVINO: export success ✅ 6.2s, saved as 'FastSAM-x_openvino_model/' (276.1 MB) - + Export complete (9.2s) Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything - Predict: yolo predict task=segment model=FastSAM-x_openvino_model imgsz=1024 - Validate: yolo val task=segment model=FastSAM-x_openvino_model imgsz=1024 data=ultralytics/datasets/sa.yaml + Predict: yolo predict task=segment model=FastSAM-x_openvino_model imgsz=1024 + Validate: yolo val task=segment model=FastSAM-x_openvino_model imgsz=1024 data=ultralytics/datasets/sa.yaml Visualize: https://netron.app @@ -230,7 +230,7 @@ Embedding the converted models into the original pipeline OpenVINO™ Runtime Python API is used to compile the model in OpenVINO IR format. The -`Core `__ +`Core `__ class provides access to the OpenVINO Runtime API. The ``core`` object, which is an instance of the ``Core`` class represents the API and it is used to compile the model. @@ -250,7 +250,7 @@ from the dropdown list: .. code:: ipython3 device = device_widget() - + device @@ -288,12 +288,12 @@ object, so we need to redefine the magic ``__call__`` method. def __init__(self, ov_model, device="CPU", stride=32, ov_config=None) -> None: ov_config = ov_config or {} self.model = core.compile_model(ov_model, device, ov_config) - + self.stride = stride self.pt = False self.fp16 = False self.names = {0: "object"} - + def __call__(self, im, **_): result = self.model(im) return torch.from_numpy(result[0]), torch.from_numpy(result[1]) @@ -306,7 +306,7 @@ pipeline. ov_config = {} if "GPU" in device.value or ("AUTO" in device.value and "GPU" in core.available_devices): ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"} - + wrapped_model = OVWrapper( ov_model_path, device=device.value, @@ -314,13 +314,13 @@ pipeline. ov_config=ov_config, ) model.predictor.model = wrapped_model - + ov_results = model(image_uri, device=device.value, retina_masks=True, imgsz=1024, conf=0.6, iou=0.9) .. parsed-literal:: - + image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 1024x1024 42 objects, 498.5ms Speed: 6.1ms preprocess, 498.5ms inference, 31.6ms postprocess per image at shape (1, 3, 1024, 1024) @@ -363,7 +363,7 @@ The optimization process contains the following steps: description="Quantization", disabled=False, ) - + do_quantize @@ -399,20 +399,20 @@ repo <-with-output.html>`__. .. code:: ipython3 %%skip not $do_quantize.value - + import pickle from contextlib import contextmanager from zipfile import ZipFile - + import cv2 from tqdm.autonotebook import tqdm - + import nncf - - + + COLLECT_CALIBRATION_DATA = False calibration_data = [] - + @contextmanager def calibration_data_collection(): global COLLECT_CALIBRATION_DATA @@ -421,58 +421,58 @@ repo <-with-output.html>`__. 
yield finally: COLLECT_CALIBRATION_DATA = False - - + + class NNCFWrapper: def __init__(self, ov_model, stride=32) -> None: self.model = core.read_model(ov_model) self.compiled_model = core.compile_model(self.model, device_name="CPU") - + self.stride = stride self.pt = False self.fp16 = False self.names = {0: "object"} - + def __call__(self, im, **_): if COLLECT_CALIBRATION_DATA: calibration_data.append(im) - + result = self.compiled_model(im) return torch.from_numpy(result[0]), torch.from_numpy(result[1]) - + # Fetch data from the web and descibe a dataloader DATA_URL = "https://ultralytics.com/assets/coco128.zip" OUT_DIR = Path('.') - + download_file(DATA_URL, directory=OUT_DIR, show_progress=True) - + if not (OUT_DIR / "coco128/images/train2017").exists(): with ZipFile('coco128.zip', "r") as zip_ref: zip_ref.extractall(OUT_DIR) - + class COCOLoader(torch.utils.data.Dataset): def __init__(self, images_path): self.images = list(Path(images_path).iterdir()) - + def __getitem__(self, index): if isinstance(index, slice): return [self.read_image(image_path) for image_path in self.images[index]] return self.read_image(self.images[index]) - + def read_image(self, image_path): image = cv2.imread(str(image_path)) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) return image - + def __len__(self): return len(self.images) - - + + def collect_calibration_data_for_decoder(model, calibration_dataset_size: int, calibration_cache_path: Path): global calibration_data - - + + if not calibration_cache_path.exists(): coco_dataset = COCOLoader(OUT_DIR / 'coco128/images/train2017') with calibration_data_collection(): @@ -484,10 +484,10 @@ repo <-with-output.html>`__. else: with open(calibration_cache_path, "rb") as f: calibration_data = pickle.load(f) - + return calibration_data - - + + def quantize(model, save_model_path: Path, calibration_cache_path: Path, calibration_dataset_size: int, preset: nncf.QuantizationPreset): calibration_data = collect_calibration_data_for_decoder( @@ -508,10 +508,10 @@ repo <-with-output.html>`__. ) ) ov.save_model(quantized_ov_decoder, save_model_path) - + wrapped_model = NNCFWrapper(ov_model_path, stride=model.predictor.model.stride) model.predictor.model = wrapped_model - + calibration_dataset_size = 128 quantized_model_path = Path(f"{model_name}_quantized") / "FastSAM-x.xml" calibration_cache_path = Path(f"calibration_data/coco{calibration_dataset_size}.pkl") @@ -553,7 +553,7 @@ repo <-with-output.html>`__. INFO:nncf:Not adding activation input quantizer for operation: 347 __module.model.22/aten::add/Add INFO:nncf:Not adding activation input quantizer for operation: 359 __module.model.22/aten::add/Add_1 371 __module.model.22/aten::div/Divide - + INFO:nncf:Not adding activation input quantizer for operation: 360 __module.model.22/aten::sub/Subtract_1 INFO:nncf:Not adding activation input quantizer for operation: 382 __module.model.22/aten::mul/Multiply @@ -595,15 +595,15 @@ calibration dataset to measure the performance. .. code:: ipython3 %%skip not $do_quantize.value - + import datetime - + coco_dataset = COCOLoader(OUT_DIR / 'coco128/images/train2017') calibration_dataset_size = 128 - + wrapped_model = OVWrapper(ov_model_path, device=device.value, stride=model.predictor.model.stride) model.predictor.model = wrapped_model - + start_time = datetime.datetime.now() for image in tqdm(coco_dataset, desc="Measuring inference time"): model(image, retina_masks=True, imgsz=1024, conf=0.6, iou=0.9, verbose=False) @@ -627,10 +627,10 @@ calibration dataset to measure the performance. .. 
code:: ipython3 %%skip not $do_quantize.value - + quantized_wrapped_model = OVWrapper(quantized_model_path, device=device.value, stride=model.predictor.model.stride) model.predictor.model = quantized_wrapped_model - + start_time = datetime.datetime.now() for image in tqdm(coco_dataset, desc="Measuring inference time"): model(image, retina_masks=True, imgsz=1024, conf=0.6, iou=0.9, verbose=False) @@ -670,8 +670,8 @@ bounding boxes on input image. import cv2 import numpy as np import matplotlib.pyplot as plt - - + + def fast_process( annotations, image, @@ -684,12 +684,12 @@ bounding boxes on input image. ): original_h = image.height original_w = image.width - + if better_quality: for i, mask in enumerate(annotations): mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)) annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)) - + inner_mask = fast_show_mask( annotations, plt.gca(), @@ -699,7 +699,7 @@ bounding boxes on input image. target_height=original_h, target_width=original_w, ) - + if with_contours: contour_all = [] temp = np.zeros((original_h, original_w, 1)) @@ -717,18 +717,18 @@ bounding boxes on input image. cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2 // scale) color = np.array([0 / 255, 0 / 255, 255 / 255, 0.9]) contour_mask = temp / 255 * color.reshape(1, 1, -1) - + image = image.convert("RGBA") overlay_inner = Image.fromarray((inner_mask * 255).astype(np.uint8), "RGBA") image.paste(overlay_inner, (0, 0), overlay_inner) - + if with_contours: overlay_contour = Image.fromarray((contour_mask * 255).astype(np.uint8), "RGBA") image.paste(overlay_contour, (0, 0), overlay_contour) - + return image - - + + # CPU post process def fast_show_mask( annotation, @@ -746,7 +746,7 @@ bounding boxes on input image. areas = np.sum(annotation, axis=(1, 2)) sorted_indices = np.argsort(areas)[::1] annotation = annotation[sorted_indices] - + index = (annotation != 0).argmax(axis=0) if random_color: color = np.random.random((mask_sum, 1, 1, 3)) @@ -755,20 +755,20 @@ bounding boxes on input image. transparency = np.ones((mask_sum, 1, 1, 1)) * 0.6 visual = np.concatenate([color, transparency], axis=-1) mask_image = np.expand_dims(annotation, -1) * visual - + mask = np.zeros((height, weight, 4)) - + h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing="ij") indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) - + mask[h_indices, w_indices, :] = mask_image[indices] if bbox is not None: x1, y1, x2, y2 = bbox ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor="b", linewidth=1)) - + if not retinamask: mask = cv2.resize(mask, (target_width, target_height), interpolation=cv2.INTER_NEAREST) - + return mask This is the main callback function that is called to segment an image @@ -779,8 +779,8 @@ based on user input. object_points = [] background_points = [] bbox_points = [] - - + + def segment( image, model_type, @@ -796,14 +796,14 @@ based on user input. model.predictor.model = quantized_wrapped_model else: model.predictor.model = wrapped_model - + input_size = int(input_size) w, h = image.size scale = input_size / max(w, h) new_w = int(w * scale) new_h = int(h * scale) image = image.resize((new_w, new_h)) - + results = model( image, retina_masks=use_retina, @@ -811,14 +811,14 @@ based on user input. 
conf=conf_threshold, imgsz=input_size, ) - + masks = results[0].masks.data # Calculate annotations if not (object_points or bbox_points): annotations = masks.cpu().numpy() else: annotations = [] - + if object_points: all_points = object_points + background_points labels = [1] * len(object_points) + [0] * len(background_points) @@ -841,20 +841,20 @@ based on user input. x = max(min(x, new_w), 0) y = max(min(y, new_h), 0) scaled_bbox_points.append((x, y)) - + for i in range(0, len(scaled_bbox_points) - 1, 2): x0, y0, x1, y1 = *scaled_bbox_points[i], *scaled_bbox_points[i + 1] - + intersection_area = torch.sum(masks[:, y0:y1, x0:x1], dim=(1, 2)) masks_area = torch.sum(masks, dim=(1, 2)) bbox_area = (y1 - y0) * (x1 - x0) - + union = bbox_area + masks_area - intersection_area iou = intersection_area / union max_iou_index = torch.argmax(iou) - + annotations.append(masks[max_iou_index].cpu().numpy()) - + return fast_process( annotations=np.array(annotations), image=image, @@ -871,11 +871,11 @@ based on user input. if not Path("gradio_helper.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/fast-segment-anything/gradio_helper.py") open("gradio_helper.py", "w").write(r.text) - + from gradio_helper import make_demo - + demo = make_demo(fn=segment, quantized=do_quantize.value) - + try: demo.queue().launch(debug=False) except Exception: @@ -888,7 +888,7 @@ based on user input. .. parsed-literal:: Running on local URL: http://127.0.0.1:7860 - + To create a public link, set `share=True` in `launch()`. diff --git a/docs/notebooks/gpu-device-with-output.rst b/docs/notebooks/gpu-device-with-output.rst index b06b55c8af8919..3e10cf4ab7c8bf 100644 --- a/docs/notebooks/gpu-device-with-output.rst +++ b/docs/notebooks/gpu-device-with-output.rst @@ -99,10 +99,10 @@ cards `__. To get started, first `install -OpenVINO `__ +OpenVINO `__ on a system equipped with one or more Intel GPUs. Follow the `GPU configuration -instructions `__ +instructions `__ to configure OpenVINO to work with your GPU. Then, read on to learn how to accelerate inference with GPUs in OpenVINO! @@ -160,7 +160,7 @@ section. If the GPUs are installed correctly on the system and still do not appear in the list, follow the steps described -`here `__ +`here `__ to configure your GPU drivers to work with OpenVINO. Once we have the GPUs working with OpenVINO, we can proceed with the next sections. @@ -509,7 +509,7 @@ hints.PerformanceMode.CUMULATIVE_THROUGHPUT}) \` how to set up an asynchronous pipeline that takes advantage of parallelism to increase throughput.** To learn more, see `Asynchronous - Inferencing `__ + Inferencing `__ in OpenVINO as well as the `Asynchronous Inference notebook `__. @@ -535,7 +535,7 @@ Note that benchmark_app only requires the model path to run but both the device and hint arguments will be useful to us. For more advanced usages, the tool itself has other options that can be checked by running ``benchmark_app -h`` or reading the -`docs `__. +`docs `__. 
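The asynchronous inferencing pages referenced above boil down to a simple pattern: compile with a throughput-oriented performance hint and keep several inference requests in flight with ``AsyncInferQueue``. The sketch below illustrates that pattern with a placeholder model path and random input data.

.. code-block:: python

    import numpy as np
    import openvino as ov
    import openvino.properties.hint as hints

    core = ov.Core()
    compiled = core.compile_model(
        "model.xml",  # placeholder IR path
        "AUTO",       # or "GPU", "MULTI:GPU.0,GPU.1", ...
        {hints.performance_mode(): hints.PerformanceMode.THROUGHPUT},
    )

    results = {}


    def on_done(request, frame_id):
        # Store the first output of each finished request.
        results[frame_id] = request.get_output_tensor(0).data.copy()


    # With no explicit job count, the runtime picks the optimal number of requests.
    queue = ov.AsyncInferQueue(compiled)
    queue.set_callback(on_done)

    frames = [np.random.rand(*compiled.input(0).shape).astype(np.float32) for _ in range(8)]
    for i, frame in enumerate(frames):
        queue.start_async({compiled.input(0): frame}, userdata=i)
    queue.wait_all()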
The following example shows how to benchmark a simple model, using a GPU with a latency focus: @@ -1439,8 +1439,8 @@ corresponding documentation: - `Query Device Properties `__ - `Configurations for GPUs with - OpenVINO `__ + OpenVINO `__ - `Benchmark Python - Tool `__ + Tool `__ - `Asynchronous - Inferencing `__ + Inferencing `__ diff --git a/docs/notebooks/hello-npu-with-output.rst b/docs/notebooks/hello-npu-with-output.rst index 0160e5f23c534c..109f78a14f1b28 100644 --- a/docs/notebooks/hello-npu-with-output.rst +++ b/docs/notebooks/hello-npu-with-output.rst @@ -77,7 +77,7 @@ other devices, for more streamlined resource management. Note that the NPU plugin is included in PIP installation of OpenVINO™ and you need to `install a proper NPU -driver `__ +driver `__ to use it successfully. | **Supported Platforms**: @@ -483,7 +483,7 @@ Note that benchmark_app only requires the model path to run but both device and hint arguments will be useful to us. For more advanced usages, the tool itself has other options that can be checked by running ``benchmark_app -h`` or reading the -`docs `__. +`docs `__. The following example shows us to benchmark a simple model, using a NPU with latency focus: diff --git a/docs/notebooks/image-classification-quantization-with-output.rst b/docs/notebooks/image-classification-quantization-with-output.rst index fb3efbdbe9ef1f..0dc9f5dbc766b3 100644 --- a/docs/notebooks/image-classification-quantization-with-output.rst +++ b/docs/notebooks/image-classification-quantization-with-output.rst @@ -398,7 +398,7 @@ Compare Performance of the Original and Quantized Models Finally, measure the inference performance of the ``FP32`` and ``INT8`` models, using `Benchmark -Tool `__ +Tool `__ - an inference performance measurement tool in OpenVINO. **NOTE**: For more accurate performance, it is recommended to run diff --git a/docs/notebooks/knowledge-graphs-conve-with-output.rst b/docs/notebooks/knowledge-graphs-conve-with-output.rst index 55c32a97c97e93..c8a642e7fc4107 100644 --- a/docs/notebooks/knowledge-graphs-conve-with-output.rst +++ b/docs/notebooks/knowledge-graphs-conve-with-output.rst @@ -570,7 +570,7 @@ Benchmark the converted OpenVINO model using benchmark app The OpenVINO toolkit provides a benchmarking application to gauge the platform specific runtime performance that can be obtained under optimal configuration parameters for a given model. For more details refer to: -https://docs.openvino.ai/2024/learn-openvino/openvino-samples/benchmark-tool.html +https://docs.openvino.ai/2025/learn-openvino/openvino-samples/benchmark-tool.html Here, we use the benchmark application to obtain performance estimates under optimal configuration for the knowledge graph model inference. We diff --git a/docs/notebooks/language-quantize-bert-with-output.rst b/docs/notebooks/language-quantize-bert-with-output.rst index 65564423fa67a3..8da52d1cf9230c 100644 --- a/docs/notebooks/language-quantize-bert-with-output.rst +++ b/docs/notebooks/language-quantize-bert-with-output.rst @@ -496,7 +496,7 @@ Frames Per Second (FPS) for images. Finally, measure the inference performance of OpenVINO ``FP32`` and ``INT8`` models. For this purpose, use `Benchmark -Tool `__ +Tool `__ in OpenVINO. 
**Note**: The ``benchmark_app`` tool is able to measure the diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output.rst b/docs/notebooks/latent-consistency-models-image-generation-with-output.rst index 37dd96826cd038..fc66a8274d3623 100644 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output.rst +++ b/docs/notebooks/latent-consistency-models-image-generation-with-output.rst @@ -95,11 +95,11 @@ Prerequisites from pathlib import Path import requests - + utility_files = [Path("notebook_utils.py"), Path("skip_kernel_extension.py"), Path("cmd_helper.py")] - + base_utils_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" - + for utility_file in utility_files: if not utility_file.exists(): r = requests.get(base_utils_url + utility_file.name) @@ -119,7 +119,7 @@ fine-tune of `Stable-Diffusion v1-5 `__ using Latent Consistency Distillation (LCD) approach discussed above. This model is also integrated into -`Diffusers `__ library. +`Diffusers `__ library. Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. This allows us to compare running original Stable Diffusion @@ -129,16 +129,16 @@ and distilled using LCD. The distillation approach efficiently converts a pre-trained guided diffusion model into a latent consistency model by solving an augmented PF-ODE. -For simplifying model export we will utilize Optimum Intel library. +For simplifying model export we will utilize Optimum Intel library. `Optimum Intel `__ is -the interface between the +the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use `interface `__ for exporting models to `OpenVINO Intermediate Representation -(IR) `__ +(IR) `__ format. The command bellow demonstrates basic command for model export with @@ -165,10 +165,10 @@ this step we will use fp16 as base model export precision. .. code:: ipython3 from cmd_helper import optimum_cli - + model_id = "SimianLuo/LCM_Dreamshaper_v7" model_path = Path(model_id.split("/")[-1] + "_ov") - + if not model_path.exists(): optimum_cli(model_id, model_path, additional_args={"weight-format": "fp16"}) @@ -227,9 +227,9 @@ inference. Select desired inference device from dropdown list bellow. .. code:: ipython3 from notebook_utils import device_widget - + device = device_widget() - + device @@ -244,7 +244,7 @@ inference. Select desired inference device from dropdown list bellow. .. code:: ipython3 from optimum.intel.openvino import OVDiffusionPipeline - + ov_pipe = OVDiffusionPipeline.from_pretrained(model_path, device=device.value) @@ -277,10 +277,10 @@ Now, let’s see model in action .. code:: ipython3 import torch - + prompt = "a beautiful pink unicorn, 8k" num_inference_steps = 4 - + images = ov_pipe( prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, height=512, width=512, generator=torch.Generator().manual_seed(1234567) ).images @@ -308,7 +308,7 @@ Nice. As you can see, the picture has quite a high definition 🔥. .. code:: ipython3 import gc - + del ov_pipe gc.collect(); @@ -344,11 +344,11 @@ improve model inference speed. .. 
code:: ipython3 from notebook_utils import quantization_widget - + skip_for_device = "GPU" in device.value to_quantize = quantization_widget(not skip_for_device) int8_model_path = model_path.parent / (model_path.name + "_int8") - + to_quantize @@ -380,36 +380,36 @@ model inputs for calibration we should customize ``CompiledModel``. .. code:: ipython3 %%skip not $to_quantize.value - + import datasets from tqdm.notebook import tqdm from transformers import set_seed from typing import Any, Dict, List import openvino as ov import numpy as np - + set_seed(1) - + class CompiledModelDecorator(ov.CompiledModel): def __init__(self, compiled_model, prob: float, data_cache: List[Any] = None): super().__init__(compiled_model) self.data_cache = data_cache if data_cache else [] self.prob = np.clip(prob, 0, 1) - + def __call__(self, *args, **kwargs): if np.random.rand() >= self.prob: self.data_cache.append(*args) return super().__call__(*args, **kwargs) - + def collect_calibration_data(lcm_pipeline, subset_size: int) -> List[Dict]: original_unet = lcm_pipeline.unet.request lcm_pipeline.unet.request = CompiledModelDecorator(original_unet, prob=0.3) - + dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", trust_remote_code=True).shuffle(seed=42) lcm_pipeline.set_progress_bar_config(disable=True) safety_checker = lcm_pipeline.safety_checker lcm_pipeline.safety_checker = None - + # Run inference for data collection pbar = tqdm(total=subset_size) diff = 0 @@ -430,7 +430,7 @@ model inputs for calibration we should customize ``CompiledModel``. break pbar.update(collected_subset_size - diff) diff = collected_subset_size - + calibration_dataset = lcm_pipeline.unet.request.data_cache lcm_pipeline.set_progress_bar_config(disable=False) lcm_pipeline.unet.request = original_unet @@ -440,11 +440,11 @@ model inputs for calibration we should customize ``CompiledModel``. .. code:: ipython3 %%skip not $to_quantize.value - + import logging logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) - + if not int8_model_path.exists(): subset_size = 200 ov_pipe = OVDiffusionPipeline.from_pretrained(model_path, device=device.value) @@ -472,12 +472,12 @@ Create a quantized model from the pre-trained converted OpenVINO model. .. code:: ipython3 %%skip not $to_quantize.value - + import nncf from nncf.scopes import IgnoredScope import shutil core = ov.Core() - + if not int8_model_path.exists(): unet = core.read_model(model_path / "unet/openvino_model.xml") quantized_unet = nncf.quantize( @@ -546,7 +546,7 @@ Create a quantized model from the pre-trained converted OpenVINO model. .. code:: ipython3 %%skip not $to_quantize.value - + int8_pipe = OVDiffusionPipeline.from_pretrained(int8_model_path, device=device.value) Let us check predictions with the quantized UNet using the same input @@ -555,12 +555,12 @@ data. .. code:: ipython3 %%skip not $to_quantize.value - + from IPython.display import display - + prompt = "a beautiful pink unicorn, 8k" num_inference_steps = 4 - + images = int8_pipe( prompt=prompt, num_inference_steps=num_inference_steps, @@ -569,7 +569,7 @@ data. width=512, generator=torch.Generator().manual_seed(1234567) ).images - + display(images[0]) @@ -598,9 +598,9 @@ pipelines, we use median inference time on calibration subset. .. 
code:: ipython3 %%skip not $to_quantize.value - + import time - + validation_size = 10 calibration_dataset = datasets.load_dataset("google-research-datasets/conceptual_captions", split="train", trust_remote_code=True) validation_data = [] @@ -609,7 +609,7 @@ pipelines, we use median inference time on calibration subset. break prompt = batch["caption"] validation_data.append(prompt) - + def calculate_inference_time(pipeline, calibration_dataset): inference_time = [] pipeline.set_progress_bar_config(disable=True) @@ -632,14 +632,14 @@ pipelines, we use median inference time on calibration subset. .. code:: ipython3 %%skip not $to_quantize.value - + int8_latency = calculate_inference_time(int8_pipe, validation_data) del int8_pipe gc.collect() ov_pipe = OVDiffusionPipeline.from_pretrained(model_path, device=device.value) fp_latency = calculate_inference_time(ov_pipe, validation_data) print(f"Performance speed up: {fp_latency / int8_latency:.3f}") - + del ov_pipe gc.collect(); @@ -658,11 +658,11 @@ Compare UNet file size UNET_OV_PATH = model_path / "unet/openvino_model.xml" UNET_INT8_OV_PATH = int8_model_path / "unet/openvino_model.xml" - + if UNET_INT8_OV_PATH.exists(): fp16_ir_model_size = UNET_OV_PATH.with_suffix(".bin").stat().st_size / 1024 quantized_model_size = UNET_INT8_OV_PATH.with_suffix(".bin").stat().st_size / 1024 - + print(f"FP16 model size: {fp16_ir_model_size:.2f} KB") print(f"INT8 model size: {quantized_model_size:.2f} KB") print(f"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}") @@ -722,10 +722,10 @@ generation process. .. code:: ipython3 import ipywidgets as widgets - + int8_can_be_used = int8_model_path.exists() and "GPU" not in device.value use_quantized_model = widgets.Checkbox(value=int8_can_be_used, description="Use INT8 model", disabled=not int8_can_be_used) - + use_quantized_model @@ -740,9 +740,9 @@ generation process. .. code:: ipython3 import openvino_genai as ov_genai - + used_model_path = model_path if not use_quantized_model.value else int8_model_path - + pipe = ov_genai.Text2ImagePipeline(used_model_path, device.value) .. code:: ipython3 @@ -750,30 +750,30 @@ generation process. 
from PIL import Image import torch import openvino as ov - - + + class Generator(ov_genai.Generator): def __init__(self, seed): ov_genai.Generator.__init__(self) self.generator = torch.Generator(device="cpu").manual_seed(seed) - + def next(self): return torch.randn(1, generator=self.generator, dtype=torch.float32).item() - + def randn_tensor(self, shape: ov.Shape): torch_tensor = torch.randn(list(shape), generator=self.generator, dtype=torch.float32) return ov.Tensor(torch_tensor.numpy()) - - + + prompt = "a beautiful pink unicorn, 8k" num_inference_steps = 4 - + random_generator = Generator(1234567) - + image_tensor = pipe.generate(prompt, width=512, height=512, num_inference_steps=4, num_images_per_prompt=1, generator=random_generator) - + image = Image.fromarray(image_tensor.data[0]) - + image @@ -793,16 +793,16 @@ Interactive demo import random import gradio as gr import numpy as np - + MAX_SEED = np.iinfo(np.int32).max - - + + def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: if randomize_seed: seed = random.randint(0, MAX_SEED) return seed - - + + def generate( prompt: str, seed: int = 0, @@ -828,11 +828,11 @@ Interactive demo url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/latent-consistency-models-image-generation/gradio_helper.py" ) open("gradio_helper.py", "w").write(r.text) - + from gradio_helper import make_demo_lcm - + demo = make_demo_lcm(fn=generate) - + try: demo.queue().launch(debug=False) except Exception: diff --git a/docs/notebooks/llm-chatbot-generate-api-with-output.rst b/docs/notebooks/llm-chatbot-generate-api-with-output.rst index 4a57c2fbbed178..fc343a5f306e50 100644 --- a/docs/notebooks/llm-chatbot-generate-api-with-output.rst +++ b/docs/notebooks/llm-chatbot-generate-api-with-output.rst @@ -682,7 +682,7 @@ is the interface between the and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use cli interface for exporting models to `OpenVINO Intermediate Representation -(IR) `__ +(IR) `__ format. .. raw:: html diff --git a/docs/notebooks/llm-chatbot-with-output.rst b/docs/notebooks/llm-chatbot-with-output.rst index e3c7ff97af919a..09a2f0e1a572d6 100644 --- a/docs/notebooks/llm-chatbot-with-output.rst +++ b/docs/notebooks/llm-chatbot-with-output.rst @@ -609,7 +609,7 @@ the interface between the and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use cli interface for exporting models to `OpenVINO Intermediate Representation -(IR) `__ +(IR) `__ format. 
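Besides the ``optimum-cli`` command discussed in these notebooks, the same export can be driven from Python through Optimum Intel. The sketch below is illustrative only: the checkpoint name is a placeholder and the 4-bit settings roughly mirror the compression options used elsewhere in this notebook.

.. code-block:: python

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    model_id = "HuggingFaceH4/zephyr-7b-beta"  # placeholder; any supported causal-LM checkpoint

    # export=True converts the checkpoint to OpenVINO IR on the fly;
    # the quantization config compresses the weights to 4 bit during export.
    ov_model = OVModelForCausalLM.from_pretrained(
        model_id,
        export=True,
        quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8),
    )

    ov_model.save_pretrained("zephyr-7b-beta-int4-ov")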
The command bellow demonstrates basic command for model export with diff --git a/docs/notebooks/llm-rag-langchain-with-output.rst b/docs/notebooks/llm-rag-langchain-with-output.rst index 1dec9cb2fb6659..bc5c54abbf2880 100644 --- a/docs/notebooks/llm-rag-langchain-with-output.rst +++ b/docs/notebooks/llm-rag-langchain-with-output.rst @@ -97,22 +97,22 @@ Install required dependencies import os import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) with open("notebook_utils.py", "w") as f: f.write(r.text) - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", ) open("pip_helper.py", "w").write(r.text) - + from pip_helper import pip_install - + os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - + pip_install("--pre", "-U", "openvino>=2024.2.0", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") pip_install("--pre", "-U", "openvino-tokenizers[transformers]", "--extra-index-url", "https://storage.openvinotoolkit.org/simple/wheels/nightly") pip_install( @@ -147,16 +147,16 @@ Install required dependencies import requests import shutil import io - + # fetch model configuration - + config_shared_path = Path("../../utils/llm_config.py") config_dst_path = Path("llm_config.py") text_example_en_path = Path("text_example_en.pdf") text_example_cn_path = Path("text_example_cn.pdf") text_example_en = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039728/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final.pdf" text_example_cn = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039713/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final_CH.pdf" - + if not config_dst_path.exists(): if config_shared_path.exists(): try: @@ -175,13 +175,13 @@ Install required dependencies r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") with open("llm_config.py", "w", encoding="utf-8") as f: f.write(r.text) - + if not text_example_en_path.exists(): r = requests.get(url=text_example_en) content = io.BytesIO(r.content) with open("text_example_en.pdf", "wb") as f: f.write(content.read()) - + if not text_example_cn_path.exists(): r = requests.get(url=text_example_cn) content = io.BytesIO(r.content) @@ -262,16 +262,16 @@ quality. SUPPORTED_RERANK_MODELS, SUPPORTED_LLM_MODELS, ) - + model_languages = list(SUPPORTED_LLM_MODELS) - + model_language = widgets.Dropdown( options=model_languages, value=model_languages[0], description="Model Language:", disabled=False, ) - + model_language @@ -286,14 +286,14 @@ quality. .. code:: ipython3 llm_model_ids = [model_id for model_id, model_config in SUPPORTED_LLM_MODELS[model_language.value].items() if model_config.get("rag_prompt_template")] - + llm_model_id = widgets.Dropdown( options=llm_model_ids, value=llm_model_ids[-1], description="Model:", disabled=False, ) - + llm_model_id @@ -317,13 +317,13 @@ quality. `Optimum Intel `__ is -the interface between the +the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use cli interface for exporting models to `OpenVINO Intermediate Representation -(IR) `__ +(IR) `__ format. The command bellow demonstrates basic command for model export with @@ -374,7 +374,7 @@ sacrifice of the model size and inference latency. .. 
code:: ipython3 from IPython.display import Markdown, display - + prepare_int4_model = widgets.Checkbox( value=True, description="Prepare INT4 model", @@ -390,7 +390,7 @@ sacrifice of the model size and inference latency. description="Prepare FP16 model", disabled=False, ) - + display(prepare_int4_model) display(prepare_int8_model) display(prepare_fp16_model) @@ -461,8 +461,8 @@ with INT4 precision. fp16_model_dir = Path(llm_model_id.value) / "FP16" int8_model_dir = Path(llm_model_id.value) / "INT8_compressed_weights" int4_model_dir = Path(llm_model_id.value) / "INT4_compressed_weights" - - + + def convert_to_fp16(): if (fp16_model_dir / "openvino_model.xml").exists(): return @@ -474,8 +474,8 @@ with INT4 precision. display(Markdown("**Export command:**")) display(Markdown(f"`{export_command}`")) ! $export_command - - + + def convert_to_int8(): if (int8_model_dir / "openvino_model.xml").exists(): return @@ -488,8 +488,8 @@ with INT4 precision. display(Markdown("**Export command:**")) display(Markdown(f"`{export_command}`")) ! $export_command - - + + def convert_to_int4(): compression_configs = { "zephyr-7b-beta": { @@ -559,7 +559,7 @@ with INT4 precision. "ratio": 0.8, }, } - + model_compression_params = compression_configs.get(llm_model_id.value, compression_configs["default"]) if (int4_model_dir / "openvino_model.xml").exists(): return @@ -577,8 +577,8 @@ with INT4 precision. display(Markdown("**Export command:**")) display(Markdown(f"`{export_command}`")) ! $export_command - - + + if prepare_fp16_model.value: convert_to_fp16() if prepare_int8_model.value: @@ -593,7 +593,7 @@ Let’s compare model size for different compression types fp16_weights = fp16_model_dir / "openvino_model.bin" int8_weights = int8_model_dir / "openvino_model.bin" int4_weights = int4_model_dir / "openvino_model.bin" - + if fp16_weights.exists(): print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): @@ -619,14 +619,14 @@ filter them out according the LLM you selected. .. code:: ipython3 embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS[model_language.value]) - + embedding_model_id = widgets.Dropdown( options=embedding_model_id, value=embedding_model_id[0], description="Embedding Model:", disabled=False, ) - + embedding_model_id @@ -656,7 +656,7 @@ OpenVINO embedding model and tokenizer can be exported by export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"]) export_command = export_command_base + " " + str(embedding_model_id.value) - + if not Path(embedding_model_id.value).exists(): ! $export_command @@ -668,14 +668,14 @@ Convert rerank model using Optimum-CLI .. code:: ipython3 rerank_model_id = list(SUPPORTED_RERANK_MODELS) - + rerank_model_id = widgets.Dropdown( options=rerank_model_id, value=rerank_model_id[0], description="Rerank Model:", disabled=False, ) - + rerank_model_id @@ -706,7 +706,7 @@ task with ``optimum-cli``. export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"]) export_command = export_command_base + " " + str(rerank_model_id.value) - + if not Path(rerank_model_id.value).exists(): ! $export_command @@ -726,9 +726,9 @@ Select device for embedding model inference .. code:: ipython3 from notebook_utils import device_widget - + embedding_device = device_widget() - + embedding_device @@ -761,9 +761,9 @@ model to NPU device. .. 
code:: ipython3 from notebook_utils import optimize_bge_embedding - + USING_NPU = embedding_device.value == "NPU" - + npu_embedding_dir = embedding_model_id.value + "-npu" npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml" if USING_NPU and not Path(npu_embedding_dir).exists(): @@ -778,7 +778,7 @@ Select device for rerank model inference .. code:: ipython3 rerank_device = device_widget() - + rerank_device @@ -813,9 +813,9 @@ Select device for LLM model inference .. code:: ipython3 from notebook_utils import device_widget - + llm_device = device_widget("CPU", exclude=["NPU"]) - + llm_device @@ -861,7 +861,7 @@ of LangChain. .. code:: ipython3 from langchain_community.embeddings import OpenVINOBgeEmbeddings - + embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id.value batch_size = 1 if USING_NPU else 4 embedding_model_kwargs = {"device": embedding_device.value, "compile": False} @@ -870,7 +870,7 @@ of LangChain. "normalize_embeddings": embedding_model_configuration["normalize_embeddings"], "batch_size": batch_size, } - + embedding = OpenVINOBgeEmbeddings( model_name_or_path=embedding_model_name, model_kwargs=embedding_model_kwargs, @@ -879,7 +879,7 @@ of LangChain. if USING_NPU: embedding.ov_model.reshape(1, 512) embedding.ov_model.compile() - + text = "This is a test document." embedding_result = embedding.embed_query(text) embedding_result[:3] @@ -917,11 +917,11 @@ class of LangChain. .. code:: ipython3 from langchain_community.document_compressors.openvino_rerank import OpenVINOReranker - + rerank_model_name = rerank_model_id.value rerank_model_kwargs = {"device": rerank_device.value} rerank_top_n = 2 - + reranker = OpenVINOReranker( model_name_or_path=rerank_model_name, model_kwargs=rerank_model_kwargs, @@ -953,14 +953,14 @@ inference framework. available_models.append("INT8") if fp16_model_dir.exists(): available_models.append("FP16") - + model_to_run = widgets.Dropdown( options=available_models, value=available_models[0], description="Model to run:", disabled=False, ) - + model_to_run @@ -982,12 +982,12 @@ inference framework. .. code:: ipython3 from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline - + import openvino.properties as props import openvino.properties.hint as hints import openvino.properties.streams as streams - - + + if model_to_run.value == "INT4": model_dir = int4_model_dir elif model_to_run.value == "INT8": @@ -995,17 +995,17 @@ inference framework. else: model_dir = fp16_model_dir print(f"Loading model from {model_dir}") - + ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - + if "GPU" in llm_device.value and "qwen2-7b-instruct" in llm_model_id.value: ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO" - + # On a GPU device a model is executed in FP16 precision. For red-pajama-3b-chat model there known accuracy # issues caused by this, which we avoid by setting precision hint to "f32". if llm_model_id.value == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device.value in ["GPU", "AUTO"]: ov_config["INFERENCE_PRECISION_HINT"] = "f32" - + llm = HuggingFacePipeline.from_model_id( model_id=str(model_dir), task="text-generation", @@ -1017,10 +1017,10 @@ inference framework. 
}, pipeline_kwargs={"max_new_tokens": 2}, ) - + if llm.pipeline.tokenizer.eos_token_id: llm.pipeline.tokenizer.pad_token_id = llm.pipeline.tokenizer.eos_token_id - + llm.invoke("2 + 2 =") @@ -1111,13 +1111,13 @@ The most common full sequence from raw data to answer looks like: UnstructuredPowerPointLoader, UnstructuredWordDocumentLoader, ) - - + + class ChineseTextSplitter(CharacterTextSplitter): def __init__(self, pdf: bool = False, **kwargs): super().__init__(**kwargs) self.pdf = pdf - + def split_text(self, text: str) -> List[str]: if self.pdf: text = re.sub(r"\n{3,}", "\n", text) @@ -1130,16 +1130,16 @@ The most common full sequence from raw data to answer looks like: elif ele: sent_list.append(ele) return sent_list - - + + TEXT_SPLITERS = { "Character": CharacterTextSplitter, "RecursiveCharacter": RecursiveCharacterTextSplitter, "Markdown": MarkdownTextSplitter, "Chinese": ChineseTextSplitter, } - - + + LOADERS = { ".csv": (CSVLoader, {}), ".doc": (UnstructuredWordDocumentLoader, {}), @@ -1154,7 +1154,7 @@ The most common full sequence from raw data to answer looks like: ".pptx": (UnstructuredPowerPointLoader, {}), ".txt": (TextLoader, {"encoding": "utf8"}), } - + if model_language.value == "English": text_example_path = "text_example_en.pdf" else: @@ -1179,72 +1179,72 @@ which will help to create a chain to connect RAG components including: from langchain.retrievers import ContextualCompressionRetriever from threading import Thread import gradio as gr - + stop_tokens = llm_model_configuration.get("stop_tokens") rag_prompt_template = llm_model_configuration["rag_prompt_template"] - - + + class StopOnTokens(StoppingCriteria): def __init__(self, token_ids): self.token_ids = token_ids - + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: for stop_id in self.token_ids: if input_ids[0][-1] == stop_id: return True return False - - + + if stop_tokens is not None: if isinstance(stop_tokens[0], str): stop_tokens = llm.pipeline.tokenizer.convert_tokens_to_ids(stop_tokens) - + stop_tokens = [StopOnTokens(stop_tokens)] - - + + def load_single_document(file_path: str) -> List[Document]: """ helper for loading a single document - + Params: file_path: document path Returns: documents loaded - + """ ext = "." 
+ file_path.rsplit(".", 1)[-1] if ext in LOADERS: loader_class, loader_args = LOADERS[ext] loader = loader_class(file_path, **loader_args) return loader.load() - + raise ValueError(f"File does not exist '{ext}'") - - + + def default_partial_text_processor(partial_text: str, new_text: str): """ helper for updating partially generated answer, used by default - + Params: partial_text: text buffer for storing previosly generated text new_text: text update for the current step Returns: updated text string - + """ partial_text += new_text return partial_text - - + + text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor) - - + + def create_vectordb( docs, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold, progress=gr.Progress() ): """ Initialize a vector database - + Params: doc: orignal documents provided by user spliter_name: spliter method @@ -1255,24 +1255,24 @@ which will help to create a chain to connect RAG components including: run_rerank: whether run reranker search_method: top k search method score_threshold: score threshold when selecting 'similarity_score_threshold' method - + """ global db global retriever global combine_docs_chain global rag_chain - + if vector_rerank_top_n > vector_search_top_k: gr.Warning("Search top k must >= Rerank top n") - + documents = [] for doc in docs: if type(doc) is not str: doc = doc.name documents.extend(load_single_document(doc)) - + text_splitter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap) - + texts = text_splitter.split_documents(documents) db = FAISS.from_documents(texts, embedding) if search_method == "similarity_score_threshold": @@ -1285,32 +1285,32 @@ which will help to create a chain to connect RAG components including: retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever) prompt = PromptTemplate.from_template(rag_prompt_template) combine_docs_chain = create_stuff_documents_chain(llm, prompt) - + rag_chain = create_retrieval_chain(retriever, combine_docs_chain) - + return "Vector database is Ready" - - + + def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank, search_method, score_threshold): """ Update retriever - + Params: vector_search_top_k: Vector search top k vector_rerank_top_n: Search rerank top n run_rerank: whether run reranker search_method: top k search method score_threshold: score threshold when selecting 'similarity_score_threshold' method - + """ global db global retriever global combine_docs_chain global rag_chain - + if vector_rerank_top_n > vector_search_top_k: gr.Warning("Search top k must >= Rerank top n") - + if search_method == "similarity_score_threshold": search_kwargs = {"k": vector_search_top_k, "score_threshold": score_threshold} else: @@ -1320,14 +1320,14 @@ which will help to create a chain to connect RAG components including: retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever) reranker.top_n = vector_rerank_top_n rag_chain = create_retrieval_chain(retriever, combine_docs_chain) - + return "Vector database is Ready" - - + + def bot(history, temperature, top_p, top_k, repetition_penalty, hide_full_prompt, do_rag): """ callback function for running chatbot on submit button click - + Params: history: conversation history temperature: parameter for control the level of creativity in AI-generated text. 
@@ -1337,7 +1337,7 @@ which will help to create a chain to connect RAG components including: repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text. hide_full_prompt: whether to show searching results in promopt. do_rag: whether do RAG when generating texts. - + """ streamer = TextIteratorStreamer( llm.pipeline.tokenizer, @@ -1356,7 +1356,7 @@ which will help to create a chain to connect RAG components including: ) if stop_tokens is not None: pipeline_kwargs["stopping_criteria"] = StoppingCriteriaList(stop_tokens) - + llm.pipeline_kwargs = pipeline_kwargs if do_rag: t1 = Thread(target=rag_chain.invoke, args=({"input": history[-1][0]},)) @@ -1364,19 +1364,19 @@ which will help to create a chain to connect RAG components including: input_text = rag_prompt_template.format(input=history[-1][0], context="") t1 = Thread(target=llm.invoke, args=(input_text,)) t1.start() - + # Initialize an empty string to store the generated text partial_text = "" for new_text in streamer: partial_text = text_processor(partial_text, new_text) history[-1][1] = partial_text yield history - - + + def request_cancel(): llm.pipeline.model.request.cancel() - - + + # initialize the vector store with example document create_vectordb( [text_example_path], @@ -1406,9 +1406,9 @@ Next we can create a Gradio UI and run demo. if not Path("gradio_helper.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-rag-langchain/gradio_helper.py") open("gradio_helper.py", "w").write(r.text) - + from gradio_helper import make_demo - + demo = make_demo( load_doc_fn=create_vectordb, run_fn=bot, @@ -1417,7 +1417,7 @@ Next we can create a Gradio UI and run demo. model_name=llm_model_id.value, language=model_language.value, ) - + try: demo.queue().launch() except Exception: diff --git a/docs/notebooks/llm-rag-llamaindex-with-output.rst b/docs/notebooks/llm-rag-llamaindex-with-output.rst index b3c7f4e004c7af..cd39804c651665 100644 --- a/docs/notebooks/llm-rag-llamaindex-with-output.rst +++ b/docs/notebooks/llm-rag-llamaindex-with-output.rst @@ -98,22 +98,22 @@ Install required dependencies import os import requests - + os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) with open("notebook_utils.py", "w") as f: f.write(r.text) - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", ) open("pip_helper.py", "w").write(r.text) - + from pip_helper import pip_install - + pip_install( "-q", "--extra-index-url", @@ -149,16 +149,16 @@ Install required dependencies import requests import shutil import io - + # fetch model configuration - + config_shared_path = Path("../../utils/llm_config.py") config_dst_path = Path("llm_config.py") text_example_en_path = Path("text_example_en.pdf") text_example_cn_path = Path("text_example_cn.pdf") text_example_en = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039728/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final.pdf" text_example_cn = "https://github.com/openvinotoolkit/openvino_notebooks/files/15039713/Platform.Brief_Intel.vPro.with.Intel.Core.Ultra_Final_CH.pdf" - + if not config_dst_path.exists(): if config_shared_path.exists(): try: @@ -177,14 +177,14 @@ Install required dependencies r = 
requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") with open("llm_config.py", "w", encoding="utf-8") as f: f.write(r.text) - - + + if not text_example_en_path.exists(): r = requests.get(url=text_example_en) content = io.BytesIO(r.content) with open("text_example_en.pdf", "wb") as f: f.write(content.read()) - + if not text_example_cn_path.exists(): r = requests.get(url=text_example_cn) content = io.BytesIO(r.content) @@ -260,16 +260,16 @@ quality. SUPPORTED_RERANK_MODELS, SUPPORTED_LLM_MODELS, ) - + model_languages = list(SUPPORTED_LLM_MODELS) - + model_language = widgets.Dropdown( options=model_languages, value=model_languages[0], description="Model Language:", disabled=False, ) - + model_language @@ -284,14 +284,14 @@ quality. .. code:: ipython3 llm_model_ids = [model_id for model_id, model_config in SUPPORTED_LLM_MODELS[model_language.value].items() if model_config.get("rag_prompt_template")] - + llm_model_id = widgets.Dropdown( options=llm_model_ids, value=llm_model_ids[-1], description="Model:", disabled=False, ) - + llm_model_id @@ -315,13 +315,13 @@ quality. `Optimum Intel `__ is -the interface between the +the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use cli interface for exporting models to `OpenVINO Intermediate Representation -(IR) `__ +(IR) `__ format. The command bellow demonstrates basic command for model export with @@ -372,7 +372,7 @@ sacrifice of the model size and inference latency. .. code:: ipython3 from IPython.display import Markdown, display - + prepare_int4_model = widgets.Checkbox( value=True, description="Prepare INT4 model", @@ -388,7 +388,7 @@ sacrifice of the model size and inference latency. description="Prepare FP16 model", disabled=False, ) - + display(prepare_int4_model) display(prepare_int8_model) display(prepare_fp16_model) @@ -459,8 +459,8 @@ with INT4 precision. fp16_model_dir = Path(llm_model_id.value) / "FP16" int8_model_dir = Path(llm_model_id.value) / "INT8_compressed_weights" int4_model_dir = Path(llm_model_id.value) / "INT4_compressed_weights" - - + + def convert_to_fp16(): if (fp16_model_dir / "openvino_model.xml").exists(): return @@ -472,8 +472,8 @@ with INT4 precision. display(Markdown("**Export command:**")) display(Markdown(f"`{export_command}`")) ! $export_command - - + + def convert_to_int8(): if (int8_model_dir / "openvino_model.xml").exists(): return @@ -486,8 +486,8 @@ with INT4 precision. display(Markdown("**Export command:**")) display(Markdown(f"`{export_command}`")) ! $export_command - - + + def convert_to_int4(): compression_configs = { "zephyr-7b-beta": { @@ -557,7 +557,7 @@ with INT4 precision. "ratio": 0.8, }, } - + model_compression_params = compression_configs.get(llm_model_id.value, compression_configs["default"]) if (int4_model_dir / "openvino_model.xml").exists(): return @@ -575,8 +575,8 @@ with INT4 precision. display(Markdown("**Export command:**")) display(Markdown(f"`{export_command}`")) ! 
$export_command - - + + if prepare_fp16_model.value: convert_to_fp16() if prepare_int8_model.value: @@ -591,7 +591,7 @@ Let’s compare model size for different compression types fp16_weights = fp16_model_dir / "openvino_model.bin" int8_weights = int8_model_dir / "openvino_model.bin" int4_weights = int4_model_dir / "openvino_model.bin" - + if fp16_weights.exists(): print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB") for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]): @@ -617,14 +617,14 @@ filter them out according the LLM you selected. .. code:: ipython3 embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS[model_language.value]) - + embedding_model_id = widgets.Dropdown( options=embedding_model_id, value=embedding_model_id[0], description="Embedding Model:", disabled=False, ) - + embedding_model_id @@ -654,7 +654,7 @@ OpenVINO embedding model and tokenizer can be exported by export_command_base = "optimum-cli export openvino --model {} --task feature-extraction".format(embedding_model_configuration["model_id"]) export_command = export_command_base + " " + str(embedding_model_id.value) - + if not Path(embedding_model_id.value).exists(): ! $export_command @@ -666,14 +666,14 @@ Convert rerank model using Optimum-CLI .. code:: ipython3 rerank_model_id = list(SUPPORTED_RERANK_MODELS) - + rerank_model_id = widgets.Dropdown( options=rerank_model_id, value=rerank_model_id[0], description="Rerank Model:", disabled=False, ) - + rerank_model_id @@ -704,7 +704,7 @@ task with ``optimum-cli``. export_command_base = "optimum-cli export openvino --model {} --task text-classification".format(rerank_model_configuration["model_id"]) export_command = export_command_base + " " + str(rerank_model_id.value) - + if not Path(rerank_model_id.value).exists(): ! $export_command @@ -724,7 +724,7 @@ Select device for embedding model inference .. code:: ipython3 embedding_device = device_widget() - + embedding_device @@ -752,10 +752,10 @@ model to NPU device. .. code:: ipython3 USING_NPU = embedding_device.value == "NPU" - + npu_embedding_dir = embedding_model_id.value + "-npu" npu_embedding_path = Path(npu_embedding_dir) / "openvino_model.xml" - + if USING_NPU and not Path(npu_embedding_dir).exists(): shutil.copytree(embedding_model_id.value, npu_embedding_dir) optimize_bge_embedding(Path(embedding_model_id.value) / "openvino_model.xml", npu_embedding_path) @@ -768,7 +768,7 @@ Select device for rerank model inference .. code:: ipython3 rerank_device = device_widget() - + rerank_device @@ -836,17 +836,17 @@ class of LlamaIndex. .. code:: ipython3 from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding - + embedding_model_name = npu_embedding_dir if USING_NPU else embedding_model_id.value batch_size = 1 if USING_NPU else 4 - + embedding = OpenVINOEmbedding( model_id_or_path=embedding_model_name, embed_batch_size=batch_size, device=embedding_device.value, model_kwargs={"compile": False} ) if USING_NPU: embedding._model.reshape(1, 512) embedding._model.compile() - + embeddings = embedding.get_text_embedding("Hello World!") print(len(embeddings)) print(embeddings[:5]) @@ -877,7 +877,7 @@ class of LlamaIndex. .. code:: ipython3 from llama_index.postprocessor.openvino_rerank import OpenVINORerank - + reranker = OpenVINORerank(model_id_or_path=rerank_model_id.value, device=rerank_device.value, top_n=2) @@ -905,14 +905,14 @@ inference framework. 
available_models.append("INT8") if fp16_model_dir.exists(): available_models.append("FP16") - + model_to_run = widgets.Dropdown( options=available_models, value=available_models[0], description="Model to run:", disabled=False, ) - + model_to_run @@ -932,12 +932,12 @@ inference on it. .. code:: ipython3 from llama_index.llms.openvino import OpenVINOLLM - + import openvino.properties as props import openvino.properties.hint as hints import openvino.properties.streams as streams - - + + if model_to_run.value == "INT4": model_dir = int4_model_dir elif model_to_run.value == "INT8": @@ -945,20 +945,20 @@ inference on it. else: model_dir = fp16_model_dir print(f"Loading model from {model_dir}") - + ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - + stop_tokens = llm_model_configuration.get("stop_tokens") completion_to_prompt = llm_model_configuration.get("completion_to_prompt") - + if "GPU" in llm_device.value and "qwen2-7b-instruct" in llm_model_id.value: ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO" - + # On a GPU device a model is executed in FP16 precision. For red-pajama-3b-chat model there known accuracy # issues caused by this, which we avoid by setting precision hint to "f32". if llm_model_id.value == "red-pajama-3b-chat" and "GPU" in core.available_devices and llm_device.value in ["GPU", "AUTO"]: ov_config["INFERENCE_PRECISION_HINT"] = "f32" - + llm = OpenVINOLLM( model_id_or_path=str(model_dir), context_window=3900, @@ -968,7 +968,7 @@ inference on it. completion_to_prompt=completion_to_prompt, device_map=llm_device.value, ) - + response = llm.complete("2 + 2 =") print(str(response)) @@ -976,7 +976,7 @@ inference on it. .. parsed-literal:: /home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/pydantic/_internal/_fields.py:161: UserWarning: Field "model_id" has conflict with protected namespace "model_". - + You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`. warnings.warn( @@ -1053,42 +1053,42 @@ The most common full sequence from raw data to answer looks like: from transformers import StoppingCriteria, StoppingCriteriaList import faiss import torch - + if model_language.value == "English": text_example_path = "text_example_en.pdf" else: text_example_path = "text_example_cn.pdf" - - + + class StopOnTokens(StoppingCriteria): def __init__(self, token_ids): self.token_ids = token_ids - + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: for stop_id in self.token_ids: if input_ids[0][-1] == stop_id: return True return False - - + + if stop_tokens is not None: if isinstance(stop_tokens[0], str): stop_tokens = llm._tokenizer.convert_tokens_to_ids(stop_tokens) stop_tokens = [StopOnTokens(stop_tokens)] - + loader = PyMuPDFReader() documents = loader.load(file_path=text_example_path) - + # dimensions of embedding model d = embedding._model.request.outputs[0].get_partial_shape()[2].get_length() faiss_index = faiss.IndexFlatL2(d) Settings.embed_model = embedding - + llm.max_new_tokens = 2048 if stop_tokens is not None: llm._stopping_criteria = StoppingCriteriaList(stop_tokens) Settings.llm = llm - + vector_store = FaissVectorStore(faiss_index=faiss_index) storage_context = StorageContext.from_defaults(vector_store=vector_store) index = VectorStoreIndex.from_documents( @@ -1116,7 +1116,7 @@ The most common full sequence from raw data to answer looks like: query = "What can Intel vPro® Enterprise systems offer?" 
else: query = "英特尔博锐® Enterprise系统提供哪些功能?" - + streaming_response = query_engine.query(query) streaming_response.print_response_stream() @@ -1131,19 +1131,19 @@ The most common full sequence from raw data to answer looks like: .. parsed-literal:: - - + + Intel vPro® Enterprise systems can offer a range of advanced security features to protect network infrastructure. These include network security appliances, secure access service edge (SASE), next-generation firewall (NGFW), real-time deep packet inspection, antivirus, intrusion prevention and detection, and SSL/TLS inspection. These systems support more devices, users, and key capabilities such as real-time threat detection while processing higher network throughput. They also drive advanced security features for growing network infrastructure with enhanced power efficiency and density. - + Intel QuickAssist Technology (Intel QAT) accelerates and offloads key encryption/compression workloads from the CPU to free up CPU cycles. Trusted execution environments (TEEs) with Intel Software Guard Extensions (Intel SGX) and Intel Trust Domain Extensions (Intel TDX) help protect network workloads and encryption keys across edge-to-cloud infrastructure. - + In industrial and energy sectors, Intel vPro® Enterprise systems improve manageability and help reduce the operational costs of automation and control systems. Hardened platforms ensure system reliability in extreme conditions, and high core density provides more dedicated resources to VMs. - + Intel vPro® Enterprise systems also offer higher performance per watt, one-core density, and faster DDR5 memory bandwidth to enhance throughput and efficiency for edge security workloads. Intel QuickAssist Technology (Intel QAT) accelerates and offloads key encryption/compression workloads from the CPU to free up CPU cycles. Trusted execution environments (TEEs) with Intel Software Guard Extensions (Intel SGX) and Intel Trust Domain Extensions (Intel TDX) harden platforms from unauthorized access. - + Cache Allocation Technology (CAT) within the Intel® Resource Director Technology (Intel® RDT) framework enables performance prioritization for key applications to help meet real-time deterministic requirements. - - + + Gradio Demo @@ -1159,16 +1159,16 @@ First we can check the default prompt template in LlamaIndex pipeline. .. code:: ipython3 prompts_dict = query_engine.get_prompts() - - + + def display_prompt_dict(prompts_dict): for k, p in prompts_dict.items(): text_md = f"**Prompt Key**: {k}
" f"**Text:**
" display(Markdown(text_md)) print(p.get_template()) display(Markdown("

")) - - + + display_prompt_dict(prompts_dict) @@ -1184,7 +1184,7 @@ First we can check the default prompt template in LlamaIndex pipeline. --------------------- Given the context information and not prior knowledge, answer the query. Query: {query_str} - Answer: + Answer: @@ -1204,7 +1204,7 @@ First we can check the default prompt template in LlamaIndex pipeline. {context_msg} ------------ Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer. - Refined Answer: + Refined Answer: @@ -1216,35 +1216,35 @@ First we can check the default prompt template in LlamaIndex pipeline. from langchain.text_splitter import RecursiveCharacterTextSplitter from llama_index.core.node_parser import LangchainNodeParser import gradio as gr - + TEXT_SPLITERS = { "SentenceSplitter": SentenceSplitter, "RecursiveCharacter": RecursiveCharacterTextSplitter, } - - + + def default_partial_text_processor(partial_text: str, new_text: str): """ helper for updating partially generated answer, used by default - + Params: partial_text: text buffer for storing previosly generated text new_text: text update for the current step Returns: updated text string - + """ partial_text += new_text return partial_text - - + + text_processor = llm_model_configuration.get("partial_text_processor", default_partial_text_processor) - - + + def create_vectordb(doc, spliter_name, chunk_size, chunk_overlap, vector_search_top_k, vector_rerank_top_n, run_rerank): """ Initialize a vector database - + Params: doc: orignal documents provided by user chunk_size: size of a single sentence chunk @@ -1252,14 +1252,14 @@ First we can check the default prompt template in LlamaIndex pipeline. vector_search_top_k: Vector search top k vector_rerank_top_n: Rerrank top n run_rerank: whether to run reranker - + """ global query_engine global index - + if vector_rerank_top_n > vector_search_top_k: gr.Warning("Search top k must >= Rerank top n") - + loader = PyMuPDFReader() documents = loader.load(file_path=doc.name) spliter = TEXT_SPLITERS[spliter_name](chunk_size=chunk_size, chunk_overlap=chunk_overlap) @@ -1268,7 +1268,7 @@ First we can check the default prompt template in LlamaIndex pipeline. faiss_index = faiss.IndexFlatL2(d) vector_store = FaissVectorStore(faiss_index=faiss_index) storage_context = StorageContext.from_defaults(vector_store=vector_store) - + index = VectorStoreIndex.from_documents( documents, storage_context=storage_context, @@ -1279,37 +1279,37 @@ First we can check the default prompt template in LlamaIndex pipeline. 
query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k, node_postprocessors=[reranker]) else: query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k) - + return "Vector database is Ready" - - + + def update_retriever(vector_search_top_k, vector_rerank_top_n, run_rerank): """ Update retriever - + Params: vector_search_top_k: size of searching results vector_rerank_top_n: size of rerank results run_rerank: whether run rerank step - + """ global query_engine global index - + if vector_rerank_top_n > vector_search_top_k: gr.Warning("Search top k must >= Rerank top n") - + if run_rerank: reranker.top_n = vector_rerank_top_n query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k, node_postprocessors=[reranker]) else: query_engine = index.as_query_engine(streaming=True, similarity_top_k=vector_search_top_k) - - + + def bot(history, temperature, top_p, top_k, repetition_penalty, do_rag): """ callback function for running chatbot on submit button click - + Params: history: conversation history temperature: parameter for control the level of creativity in AI-generated text. @@ -1318,7 +1318,7 @@ First we can check the default prompt template in LlamaIndex pipeline. top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability. repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text. do_rag: whether do RAG when generating texts. - + """ llm.generate_kwargs = dict( temperature=temperature, @@ -1327,7 +1327,7 @@ First we can check the default prompt template in LlamaIndex pipeline. top_k=top_k, repetition_penalty=repetition_penalty, ) - + partial_text = "" if do_rag: streaming_response = query_engine.query(history[-1][0]) @@ -1341,8 +1341,8 @@ First we can check the default prompt template in LlamaIndex pipeline. partial_text = text_processor(partial_text, new_text.delta) history[-1][1] = partial_text yield history - - + + def request_cancel(): llm._model.request.cancel() @@ -1351,9 +1351,9 @@ First we can check the default prompt template in LlamaIndex pipeline. if not Path("gradio_helper.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-rag-llamaindex/gradio_helper.py") open("gradio_helper.py", "w").write(r.text) - + from gradio_helper import make_demo - + demo = make_demo( load_doc_fn=create_vectordb, run_fn=bot, @@ -1362,7 +1362,7 @@ First we can check the default prompt template in LlamaIndex pipeline. model_name=llm_model_id.value, language=model_language.value, ) - + try: demo.queue().launch() except Exception: diff --git a/docs/notebooks/mllama-3.2-with-output.rst b/docs/notebooks/mllama-3.2-with-output.rst index 14a7a819a8b5ac..1e1417b476feb9 100644 --- a/docs/notebooks/mllama-3.2-with-output.rst +++ b/docs/notebooks/mllama-3.2-with-output.rst @@ -514,7 +514,7 @@ blog `__ + API `__ provides functionality for low-level GPU memory management, we can use this feature for sharing cross-attention keys and values between Image Encoder and Language Model. diff --git a/docs/notebooks/model-server-with-output.rst b/docs/notebooks/model-server-with-output.rst index d5a9347a46e807..34b4570439a6a7 100644 --- a/docs/notebooks/model-server-with-output.rst +++ b/docs/notebooks/model-server-with-output.rst @@ -104,10 +104,10 @@ image and a message. .. 
parsed-literal:: - + Hello from Docker! This message shows that your installation appears to be working correctly. - + To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. @@ -116,16 +116,16 @@ image and a message. executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. - + To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash - + Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ - + For more examples and ideas, visit: https://docs.docker.com/get-started/ - + Step 2: Preparing a Model Repository @@ -186,21 +186,21 @@ following rules: .. code:: ipython3 import os - + # Fetch `notebook_utils` module import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) - + open("notebook_utils.py", "w").write(r.text) from notebook_utils import download_file - + dedicated_dir = "models" model_name = "detection" model_version = "1" - + MODEL_DIR = f"{dedicated_dir}/{model_name}/{model_version}" XML_PATH = "horizontal-text-detection-0001.xml" BIN_PATH = "horizontal-text-detection-0001.bin" @@ -211,7 +211,7 @@ following rules: model_bin_url = ( "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2022.3/models_bin/1/horizontal-text-detection-0001/FP32/horizontal-text-detection-0001.bin" ) - + download_file(model_xml_url, XML_PATH, MODEL_DIR) download_file(model_bin_url, BIN_PATH, MODEL_DIR) @@ -246,14 +246,14 @@ Searching for an available serving port in local. .. code:: ipython3 import socket - + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.bind(("localhost", 0)) sock.listen(1) port = sock.getsockname()[1] sock.close() print(f"Port {port} is available") - + os.environ["port"] = str(port) @@ -286,7 +286,7 @@ Check whether the OVMS container is running normally: The required Model Server parameters are listed below. For additional configuration options, see the `Model Server Parameters -section `__. +section `__. .. raw:: html @@ -754,7 +754,7 @@ Request Model Status .. code:: ipython3 address = "localhost:" + str(port) - + # Bind the grpc address to the client object client = make_grpc_client(address) model_status = client.get_model_status(model_name=model_name) @@ -794,16 +794,16 @@ Load input image "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/intel_rnb.jpg", directory="data", ) - + # Text detection models expect an image in BGR format. image = cv2.imread(str(image_filename)) fp_image = image.astype("float32") - + # Resize the image to meet network expected input sizes. input_shape = model_metadata["inputs"]["image"]["shape"] height, width = input_shape[2], input_shape[3] resized_image = cv2.resize(fp_image, (height, width)) - + # Reshape to the network input shape. input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0) plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) @@ -835,10 +835,10 @@ Request Prediction on a Numpy Array .. code:: ipython3 inputs = {"image": input_image} - + # Run inference on model server and receive the result data boxes = client.predict(inputs=inputs, model_name=model_name)["boxes"] - + # Remove zero only boxes. 
boxes = boxes[~np.all(boxes == 0, axis=1)] print(boxes) @@ -866,17 +866,17 @@ Visualization def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True): # Define colors for boxes and descriptions. colors = {"red": (255, 0, 0), "green": (0, 255, 0)} - + # Fetch the image shapes to calculate a ratio. (real_y, real_x), (resized_y, resized_x) = ( bgr_image.shape[:2], resized_image.shape[:2], ) ratio_x, ratio_y = real_x / resized_x, real_y / resized_y - + # Convert the base image from BGR to RGB format. rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB) - + # Iterate through non-zero boxes. for box in boxes: # Pick a confidence factor from the last place in an array. @@ -888,10 +888,10 @@ Visualization (x_min, y_min, x_max, y_max) = [ (int(max(corner_position * ratio_y, 10)) if idx % 2 else int(corner_position * ratio_x)) for idx, corner_position in enumerate(box[:-1]) ] - + # Draw a box based on the position, parameters in rectangle function are: image, start_point, end_point, color, thickness. rgb_image = cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3) - + # Add text to the image based on position and confidence. # Parameters in text function are: image, text, bottom-left_corner_textfield, font, font_scale, color, thickness, line_type. if conf_labels: @@ -905,7 +905,7 @@ Visualization 1, cv2.LINE_AA, ) - + return rgb_image .. code:: ipython3 @@ -946,6 +946,6 @@ References 1. `OpenVINO™ Model Server - documentation `__ + documentation `__ 2. `OpenVINO™ Model Server GitHub repository `__ diff --git a/docs/notebooks/multilora-image-generation-with-output.rst b/docs/notebooks/multilora-image-generation-with-output.rst index e2da1edafdd8f6..cba6380442b7ca 100644 --- a/docs/notebooks/multilora-image-generation-with-output.rst +++ b/docs/notebooks/multilora-image-generation-with-output.rst @@ -116,7 +116,7 @@ and OpenVINO to accelerate end-to-end pipelines on Intel architectures. It provides ease-to-use `interface `__ for exporting models to `OpenVINO Intermediate Representation -(IR) `__ +(IR) `__ format. Applying LoRA to Original Diffusers pipeline before conversion diff --git a/docs/notebooks/music-generation-with-output.rst b/docs/notebooks/music-generation-with-output.rst index 463223b51bdf6a..afe971c8b7002a 100644 --- a/docs/notebooks/music-generation-with-output.rst +++ b/docs/notebooks/music-generation-with-output.rst @@ -542,7 +542,7 @@ Embedding the converted models into the original pipeline OpenVINO™ Runtime Python API is used to compile the model in OpenVINO IR format. The -`Core `__ +`Core `__ class provides access to the OpenVINO Runtime API. The ``core`` object, which is an instance of the ``Core`` class represents the API and it is used to compile the model. diff --git a/docs/notebooks/optimize-preprocessing-with-output.rst b/docs/notebooks/optimize-preprocessing-with-output.rst index 81e94ff3595923..a22d06d19ddee8 100644 --- a/docs/notebooks/optimize-preprocessing-with-output.rst +++ b/docs/notebooks/optimize-preprocessing-with-output.rst @@ -260,7 +260,7 @@ Graph modifications of a model shall be performed after the model is read from a drive and before it is loaded on the actual device. Pre-processing support following operations (please, see more details -`here `__) +`here `__) - Mean/Scale Normalization - Converting Precision @@ -295,7 +295,7 @@ Create ``PrePostProcessor`` Object The -`PrePostProcessor() `__ +`PrePostProcessor() `__ class enables specifying the preprocessing and postprocessing steps for a model. 
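As an illustration of the pattern this section describes, a minimal sketch of creating a ``PrePostProcessor`` and overriding the input tensor description might look as follows; the model path and the concrete preprocessing steps are placeholders rather than values from the original notebook.

.. code:: ipython3

    import openvino as ov
    from openvino.preprocess import PrePostProcessor

    core = ov.Core()
    model = core.read_model("model.xml")  # placeholder path

    ppp = PrePostProcessor(model)
    # describe the data the application will actually provide ...
    ppp.input().tensor().set_element_type(ov.Type.u8).set_layout(ov.Layout("NHWC"))
    # ... and the layout the model expects, so the required conversions are inserted
    ppp.input().model().set_layout(ov.Layout("NCHW"))
    # explicit preprocessing steps applied on the device side
    ppp.input().preprocess().convert_element_type(ov.Type.f32).scale(255.0)
    model = ppp.build()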
@@ -320,7 +320,7 @@ about user’s input tensor will be initialized to same data (type/shape/etc) as model’s input parameter. User application can override particular parameters according to application’s data. Refer to the following -`page `__ +`page `__ for more information about parameters for overriding. Below is all the specified input information: diff --git a/src/common/util/include/openvino/util/common_util.hpp b/src/common/util/include/openvino/util/common_util.hpp index a11adf29cd14f1..15ec5d8f27d588 100644 --- a/src/common/util/include/openvino/util/common_util.hpp +++ b/src/common/util/include/openvino/util/common_util.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -173,5 +174,11 @@ inline void erase_if(Container& data, const PredicateT& predicate) { std::string filter_lines_by_prefix(const std::string& str, const std::string& prefix); +template +constexpr std::array, std::common_type_t, T>, sizeof...(Args)> make_array( + Args&&... args) { + return {std::forward(args)...}; +} + } // namespace util } // namespace ov diff --git a/src/core/include/openvino/core/type/element_type.hpp b/src/core/include/openvino/core/type/element_type.hpp index 960b318b81262c..b454d886107e7c 100644 --- a/src/core/include/openvino/core/type/element_type.hpp +++ b/src/core/include/openvino/core/type/element_type.hpp @@ -70,11 +70,11 @@ enum class Type_t { /// \ingroup ov_element_cpp_api class OPENVINO_API Type { public: - Type() = default; - Type(const Type&) = default; + constexpr Type() = default; + constexpr Type(const Type&) = default; constexpr Type(const Type_t t) : m_type{t} {} explicit Type(const std::string& type); - Type& operator=(const Type&) = default; + constexpr Type& operator=(const Type&) = default; std::string c_type_string() const; size_t size() const; size_t hash() const; @@ -95,6 +95,8 @@ class OPENVINO_API Type { // The name of this type, the enum name of this type std::string get_type_name() const; friend OPENVINO_API std::ostream& operator<<(std::ostream&, const Type&); + + OPENVINO_DEPRECATED("This function is deprecated and will be removed in 2026.0.") static std::vector get_known_types(); /// \brief Checks whether this element type is merge-compatible with `t`. 
@@ -137,129 +139,131 @@ using TypeVector = std::vector; /// \brief undefined element type /// \ingroup ov_element_cpp_api -constexpr Type undefined(Type_t::undefined); +inline constexpr Type undefined(Type_t::undefined); /// \brief dynamic element type /// \ingroup ov_element_cpp_api -constexpr Type dynamic(Type_t::dynamic); +inline constexpr Type dynamic(Type_t::dynamic); /// \brief boolean element type /// \ingroup ov_element_cpp_api -constexpr Type boolean(Type_t::boolean); +inline constexpr Type boolean(Type_t::boolean); /// \brief bf16 element type /// \ingroup ov_element_cpp_api -constexpr Type bf16(Type_t::bf16); +inline constexpr Type bf16(Type_t::bf16); /// \brief f16 element type /// \ingroup ov_element_cpp_api -constexpr Type f16(Type_t::f16); +inline constexpr Type f16(Type_t::f16); /// \brief f32 element type /// \ingroup ov_element_cpp_api -constexpr Type f32(Type_t::f32); +inline constexpr Type f32(Type_t::f32); /// \brief f64 element type /// \ingroup ov_element_cpp_api -constexpr Type f64(Type_t::f64); +inline constexpr Type f64(Type_t::f64); /// \brief i4 element type /// \ingroup ov_element_cpp_api -constexpr Type i4(Type_t::i4); +inline constexpr Type i4(Type_t::i4); /// \brief i8 element type /// \ingroup ov_element_cpp_api -constexpr Type i8(Type_t::i8); +inline constexpr Type i8(Type_t::i8); /// \brief i16 element type /// \ingroup ov_element_cpp_api -constexpr Type i16(Type_t::i16); +inline constexpr Type i16(Type_t::i16); /// \brief i32 element type /// \ingroup ov_element_cpp_api -constexpr Type i32(Type_t::i32); +inline constexpr Type i32(Type_t::i32); /// \brief i64 element type /// \ingroup ov_element_cpp_api -constexpr Type i64(Type_t::i64); +inline constexpr Type i64(Type_t::i64); /// \brief binary element type /// \ingroup ov_element_cpp_api -constexpr Type u1(Type_t::u1); +inline constexpr Type u1(Type_t::u1); /// \brief u2 element type /// \ingroup ov_element_cpp_api -constexpr Type u2(Type_t::u2); +inline constexpr Type u2(Type_t::u2); /// \brief u3 element type /// \ingroup ov_element_cpp_api -constexpr Type u3(Type_t::u3); +inline constexpr Type u3(Type_t::u3); /// \brief u4 element type /// \ingroup ov_element_cpp_api -constexpr Type u4(Type_t::u4); +inline constexpr Type u4(Type_t::u4); /// \brief u6 element type /// \ingroup ov_element_cpp_api -constexpr Type u6(Type_t::u6); +inline constexpr Type u6(Type_t::u6); /// \brief u8 element type /// \ingroup ov_element_cpp_api -constexpr Type u8(Type_t::u8); +inline constexpr Type u8(Type_t::u8); /// \brief u16 element type /// \ingroup ov_element_cpp_api -constexpr Type u16(Type_t::u16); +inline constexpr Type u16(Type_t::u16); /// \brief u32 element type /// \ingroup ov_element_cpp_api -constexpr Type u32(Type_t::u32); +inline constexpr Type u32(Type_t::u32); /// \brief u64 element type /// \ingroup ov_element_cpp_api -constexpr Type u64(Type_t::u64); +inline constexpr Type u64(Type_t::u64); /// \brief nf4 element type /// \ingroup ov_element_cpp_api -constexpr Type nf4(Type_t::nf4); +inline constexpr Type nf4(Type_t::nf4); /// \brief f8e4m3 element type /// \ingroup ov_element_cpp_api -constexpr Type f8e4m3(Type_t::f8e4m3); +inline constexpr Type f8e4m3(Type_t::f8e4m3); /// \brief f8e4m3 element type /// \ingroup ov_element_cpp_api -constexpr Type f8e5m2(Type_t::f8e5m2); +inline constexpr Type f8e5m2(Type_t::f8e5m2); /// \brief string element type /// \ingroup ov_element_cpp_api -constexpr Type string(Type_t::string); +inline constexpr Type string(Type_t::string); /// \brief f4e2m1 element type /// 
\ingroup ov_element_cpp_api -constexpr Type f4e2m1(Type_t::f4e2m1); +inline constexpr Type f4e2m1(Type_t::f4e2m1); /// \brief f8e8m0 element type /// \ingroup ov_element_cpp_api -constexpr Type f8e8m0(Type_t::f8e8m0); +inline constexpr Type f8e8m0(Type_t::f8e8m0); -template -Type from() { - OPENVINO_THROW("Unknown type"); +template +constexpr Type from() { + if constexpr (std::is_same_v || std::is_same_v) { + return boolean; + } else if constexpr (std::is_same_v) { + return f16; + } else if constexpr (std::is_same_v) { + return f32; + } else if constexpr (std::is_same_v) { + return f64; + } else if constexpr (std::is_same_v) { + return i8; + } else if constexpr (std::is_same_v) { + return i16; + } else if constexpr (std::is_same_v) { + return i32; + } else if constexpr (std::is_same_v) { + return i64; + } else if constexpr (std::is_same_v) { + return u8; + } else if constexpr (std::is_same_v) { + return u16; + } else if constexpr (std::is_same_v) { + return u32; + } else if constexpr (std::is_same_v) { + return u64; + } else if constexpr (std::is_same_v) { + return bf16; + } else if constexpr (std::is_same_v) { + return f8e4m3; + } else if constexpr (std::is_same_v) { + return f8e5m2; + } else if constexpr (std::is_same_v) { + return string; + } else if constexpr (std::is_same_v) { + return f4e2m1; + } else if constexpr (std::is_same_v) { + return f8e8m0; + } else { + OPENVINO_THROW("Unknown type"); + } } -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); -template <> -OPENVINO_API Type from(); +OPENVINO_DEPRECATED( + "This function is deprecated and will be removed in 2026.0. 
Use ov::fundamental_type_for instead") OPENVINO_API Type fundamental_type_for(const Type& type); OPENVINO_API @@ -281,12 +285,12 @@ template <> class OPENVINO_API AttributeAdapter : public ValueAccessor { public: OPENVINO_RTTI("AttributeAdapter"); - AttributeAdapter(ov::element::Type& value) : m_ref(value) {} + constexpr AttributeAdapter(ov::element::Type& value) : m_ref(value) {} const std::string& get() override; void set(const std::string& value) override; - operator ov::element::Type&() { + constexpr operator ov::element::Type&() { return m_ref; } diff --git a/src/core/src/type/element_type.cpp b/src/core/src/type/element_type.cpp index 3fdda4d7f55cf8..bd61d1c985ea97 100644 --- a/src/core/src/type/element_type.cpp +++ b/src/core/src/type/element_type.cpp @@ -7,11 +7,21 @@ #include #include #include +#include #include #include "openvino/core/type/element_type_traits.hpp" +#include "openvino/util/common_util.hpp" +namespace ov::element { namespace { +constexpr size_t idx(Type_t e) noexcept { + return static_cast>(e); +} + +// Update it when new type is added +constexpr size_t enum_types_size = idx(f8e8m0) + 1; + struct TypeInfo { size_t m_bitwidth; bool m_is_real; @@ -19,252 +29,179 @@ struct TypeInfo { bool m_is_quantized; const char* m_cname; const char* m_type_name; -}; - -struct ElementTypes { - struct TypeHash { - size_t operator()(ov::element::Type_t t) const { - return static_cast(t); + const char* const* aliases; + size_t alias_count; + + bool has_name(const std::string& type) const { + if (type == m_type_name) { + return true; + } else { + const auto last = aliases + alias_count; + return std::find(aliases, last, type) != last; } - }; - - using ElementsMap = std::unordered_map; -}; + } -inline TypeInfo get_type_info(ov::element::Type_t type) { - switch (type) { - case ov::element::Type_t::undefined: - return {std::numeric_limits::max(), false, false, false, "undefined", "undefined"}; - case ov::element::Type_t::dynamic: - return {0, false, false, false, "dynamic", "dynamic"}; - case ov::element::Type_t::boolean: - return {8, false, true, false, "char", "boolean"}; - case ov::element::Type_t::bf16: - return {16, true, true, false, "bfloat16", "bf16"}; - case ov::element::Type_t::f16: - return {16, true, true, false, "float16", "f16"}; - case ov::element::Type_t::f32: - return {32, true, true, false, "float", "f32"}; - case ov::element::Type_t::f64: - return {64, true, true, false, "double", "f64"}; - case ov::element::Type_t::i4: - return {4, false, true, true, "int4_t", "i4"}; - case ov::element::Type_t::i8: - return {8, false, true, true, "int8_t", "i8"}; - case ov::element::Type_t::i16: - return {16, false, true, false, "int16_t", "i16"}; - case ov::element::Type_t::i32: - return {32, false, true, true, "int32_t", "i32"}; - case ov::element::Type_t::i64: - return {64, false, true, false, "int64_t", "i64"}; - case ov::element::Type_t::u1: - return {1, false, false, false, "uint1_t", "u1"}; - case ov::element::Type_t::u2: - return {2, false, false, false, "uint2_t", "u2"}; - case ov::element::Type_t::u3: - return {3, false, false, false, "uint3_t", "u3"}; - case ov::element::Type_t::u4: - return {4, false, false, false, "uint4_t", "u4"}; - case ov::element::Type_t::u6: - return {6, false, false, false, "uint6_t", "u6"}; - case ov::element::Type_t::u8: - return {8, false, false, true, "uint8_t", "u8"}; - case ov::element::Type_t::u16: - return {16, false, false, false, "uint16_t", "u16"}; - case ov::element::Type_t::u32: - return {32, false, false, false, "uint32_t", "u32"}; - 
case ov::element::Type_t::u64: - return {64, false, false, false, "uint64_t", "u64"}; - case ov::element::Type_t::nf4: - return {4, false, false, true, "nfloat4", "nf4"}; - case ov::element::Type_t::f8e4m3: - return {8, true, true, true, "f8e4m3", "f8e4m3"}; - case ov::element::Type_t::f8e5m2: - return {8, true, true, true, "f8e5m2", "f8e5m2"}; - case ov::element::Type_t::string: - return {8 * sizeof(std::string), false, false, false, "string", "string"}; - case ov::element::Type_t::f4e2m1: - return {4, true, true, true, "f4e2m1", "f4e2m1"}; - case ov::element::Type_t::f8e8m0: - return {8, true, true, true, "f8e8m0", "f8e8m0"}; - default: - OPENVINO_THROW("ov::element::Type_t not supported: ", type); + constexpr bool is_valid() const { + return m_cname != nullptr && m_type_name != nullptr; } }; +; + +constexpr TypeInfo type_info(size_t bitwidth, + bool is_real, + bool is_signed, + bool is_quantized, + const char* cname, + const char* type_name) { + return {bitwidth, is_real, is_signed, is_quantized, cname, type_name, nullptr, 0}; +} + +template +constexpr TypeInfo type_info(size_t bitwidth, + bool is_real, + bool is_signed, + bool is_quantized, + const char* cname, + const char* type_name, + const Array& aliases) { + return {bitwidth, is_real, is_signed, is_quantized, cname, type_name, aliases.data(), aliases.size()}; +} + +constexpr auto undefined_aliases = util::make_array("UNSPECIFIED"); +constexpr auto boolean_aliases = util::make_array("BOOL"); +constexpr auto bf16_aliases = util::make_array("BF16"); +constexpr auto f16_aliases = util::make_array("FP16"); +constexpr auto f32_aliases = util::make_array("FP32"); +constexpr auto f64_aliases = util::make_array("FP64"); +constexpr auto i4_aliases = util::make_array("I4"); +constexpr auto i8_aliases = util::make_array("I8"); +constexpr auto i16_aliases = util::make_array("I16"); +constexpr auto i32_aliases = util::make_array("I32"); +constexpr auto i64_aliases = util::make_array("I64"); +constexpr auto u1_aliases = util::make_array("U1", "bin", "BIN"); +constexpr auto u2_aliases = util::make_array("U2"); +constexpr auto u3_aliases = util::make_array("U3"); +constexpr auto u4_aliases = util::make_array("U4"); +constexpr auto u6_aliases = util::make_array("U6"); +constexpr auto u8_aliases = util::make_array("U8"); +constexpr auto u16_aliases = util::make_array("U16"); +constexpr auto u32_aliases = util::make_array("U32"); +constexpr auto u64_aliases = util::make_array("U64"); +constexpr auto nf4_aliases = util::make_array("NF4"); +constexpr auto f8e4m3_aliases = util::make_array("F8E4M3"); +constexpr auto f8e5m2_aliases = util::make_array("F8E5M2"); +constexpr auto string_aliases = util::make_array("STRING"); +constexpr auto f4e2m1_aliases = util::make_array("F4E2M1"); +constexpr auto f8e8m0_aliases = util::make_array("F8E8M0"); + +static constexpr std::array types_info = { + type_info(std::numeric_limits::max(), + false, + false, + false, + "undefined", + "undefined", + undefined_aliases), // undefined + type_info(0, false, false, false, "dynamic", "dynamic"), // dynamic + type_info(8, false, true, false, "char", "boolean", boolean_aliases), // boolean + type_info(16, true, true, false, "bfloat16", "bf16", bf16_aliases), // bf16 + type_info(16, true, true, false, "float16", "f16", f16_aliases), // f16 + type_info(32, true, true, false, "float", "f32", f32_aliases), // f32 + type_info(64, true, true, false, "double", "f64", f64_aliases), // f64 + type_info(4, false, true, true, "int4_t", "i4", i4_aliases), // i4 + type_info(8, false, true, 
true, "int8_t", "i8", i8_aliases), // i8 + type_info(16, false, true, false, "int16_t", "i16", i16_aliases), // i16 + type_info(32, false, true, true, "int32_t", "i32", i32_aliases), // i32 + type_info(64, false, true, false, "int64_t", "i64", i64_aliases), // i64 + type_info(1, false, false, false, "uint1_t", "u1", u1_aliases), // u1 + type_info(2, false, false, false, "uint2_t", "u2", u2_aliases), // u2 + type_info(3, false, false, false, "uint3_t", "u3", u3_aliases), // u3 + type_info(4, false, false, false, "uint4_t", "u4", u4_aliases), // u4 + type_info(6, false, false, false, "uint6_t", "u6", u6_aliases), // u6 + type_info(8, false, false, true, "uint8_t", "u8", u8_aliases), // u8 + type_info(16, false, false, false, "uint16_t", "u16", u16_aliases), // u16 + type_info(32, false, false, false, "uint32_t", "u32", u32_aliases), // u32 + type_info(64, false, false, false, "uint64_t", "u64", u64_aliases), // u64 + type_info(4, false, false, true, "nfloat4", "nf4", nf4_aliases), // nf4 + type_info(8, true, true, true, "f8e4m3", "f8e4m3", f8e4m3_aliases), // f8e4m3 + type_info(8, true, true, true, "f8e5m2", "f8e5m2", f8e5m2_aliases), // f8e5m2 + type_info(8 * sizeof(std::string), false, false, false, "string", "string", string_aliases), // string + type_info(4, true, true, true, "f4e2m1", "f4e2m1", f4e2m1_aliases), // f4e2m1 + type_info(8, true, true, true, "f8e8m0", "f8e8m0", f8e8m0_aliases) // f8e8m0 +}; -ov::element::Type type_from_string(const std::string& type) { - if (type == "f16" || type == "FP16") { - return ::ov::element::Type(::ov::element::Type_t::f16); - } else if (type == "f32" || type == "FP32") { - return ::ov::element::Type(::ov::element::Type_t::f32); - } else if (type == "bf16" || type == "BF16") { - return ::ov::element::Type(::ov::element::Type_t::bf16); - } else if (type == "f64" || type == "FP64") { - return ::ov::element::Type(::ov::element::Type_t::f64); - } else if (type == "i4" || type == "I4") { - return ::ov::element::Type(::ov::element::Type_t::i4); - } else if (type == "i8" || type == "I8") { - return ::ov::element::Type(::ov::element::Type_t::i8); - } else if (type == "i16" || type == "I16") { - return ::ov::element::Type(::ov::element::Type_t::i16); - } else if (type == "i32" || type == "I32") { - return ::ov::element::Type(::ov::element::Type_t::i32); - } else if (type == "i64" || type == "I64") { - return ::ov::element::Type(::ov::element::Type_t::i64); - } else if (type == "u1" || type == "U1" || type == "BIN" || type == "bin") { - return ::ov::element::Type(::ov::element::Type_t::u1); - } else if (type == "u2" || type == "U2") { - return ::ov::element::Type(::ov::element::Type_t::u2); - } else if (type == "u3" || type == "U3") { - return ::ov::element::Type(::ov::element::Type_t::u3); - } else if (type == "u4" || type == "U4") { - return ::ov::element::Type(::ov::element::Type_t::u4); - } else if (type == "u6" || type == "U6") { - return ::ov::element::Type(::ov::element::Type_t::u6); - } else if (type == "u8" || type == "U8") { - return ::ov::element::Type(::ov::element::Type_t::u8); - } else if (type == "u16" || type == "U16") { - return ::ov::element::Type(::ov::element::Type_t::u16); - } else if (type == "u32" || type == "U32") { - return ::ov::element::Type(::ov::element::Type_t::u32); - } else if (type == "u64" || type == "U64") { - return ::ov::element::Type(::ov::element::Type_t::u64); - } else if (type == "boolean" || type == "BOOL") { - return ::ov::element::Type(::ov::element::Type_t::boolean); - } else if (type == "string" || type == 
"STRING") { - return ::ov::element::Type(::ov::element::Type_t::string); - } else if (type == "undefined" || type == "UNSPECIFIED") { - return ::ov::element::Type(::ov::element::Type_t::undefined); - } else if (type == "dynamic") { - return ::ov::element::Type(::ov::element::Type_t::dynamic); - } else if (type == "nf4" || type == "NF4") { - return ::ov::element::Type(::ov::element::Type_t::nf4); - } else if (type == "f8e4m3" || type == "F8E4M3") { - return ::ov::element::Type(::ov::element::Type_t::f8e4m3); - } else if (type == "f8e5m2" || type == "F8E5M2") { - return ::ov::element::Type(::ov::element::Type_t::f8e5m2); - } else if (type == "f4e2m1" || type == "F4E2M1") { - return ::ov::element::Type(::ov::element::Type_t::f4e2m1); - } else if (type == "f8e8m0" || type == "F8E8M0") { - return ::ov::element::Type(::ov::element::Type_t::f8e8m0); - } else { - OPENVINO_THROW("Incorrect type: ", type); +constexpr bool validate_types_info(decltype(types_info)& info, size_t i = 0) { + return i >= info.size() ? true : info[i].is_valid() ? validate_types_info(info, i + 1) : false; +} + +static_assert(validate_types_info(types_info), "Some entries of type_info are invalid."); + +constexpr bool is_valid_type_idx(size_t idx) { + return idx < types_info.size(); +} + +size_t type_idx_for(const std::string& type_name) { + size_t type_idx = 0; + for (; is_valid_type_idx(type_idx); ++type_idx) { + if (types_info[type_idx].has_name(type_name)) { + break; + } } + return type_idx; } + +const TypeInfo& get_type_info(Type_t type) { + const auto type_idx = idx(type); + OPENVINO_ASSERT(is_valid_type_idx(type_idx), "Type_t not supported: ", type_idx); + return types_info[type_idx]; +} + +Type type_from_string(const std::string& type) { + const auto type_idx = type_idx_for(type); + OPENVINO_ASSERT(is_valid_type_idx(type_idx), "Unsupported element type: ", type); + return {static_cast(type_idx)}; +} + +// generate known types automatically +static constexpr auto known_types = [] { + std::array types; + for (size_t idx = 1, i = 0; i < types.size(); ++idx, ++i) { + types[i] = Type{static_cast(idx)}; + } + return types; +}(); } // namespace -std::vector ov::element::Type::get_known_types() { - std::vector rc = { - &ov::element::dynamic, &ov::element::boolean, &ov::element::bf16, &ov::element::f16, &ov::element::f32, - &ov::element::f64, &ov::element::i4, &ov::element::i8, &ov::element::i16, &ov::element::i32, - &ov::element::i64, &ov::element::u1, &ov::element::u2, &ov::element::u3, &ov::element::u4, - &ov::element::u6, &ov::element::u8, &ov::element::u16, &ov::element::u32, &ov::element::u64, - &ov::element::nf4, &ov::element::f8e4m3, &ov::element::f8e5m2, &ov::element::string, &ov::element::f4e2m1, - &ov::element::f8e8m0}; - return rc; +std::vector Type::get_known_types() { + std::vector result(known_types.size()); + for (size_t i = 0; i < known_types.size(); ++i) { + result[i] = &known_types[i]; + } + return result; } -ov::element::Type::Type(const std::string& type) : Type(type_from_string(type)) {} +Type::Type(const std::string& type) : Type(type_from_string(type)) {} -std::string ov::element::Type::c_type_string() const { +std::string Type::c_type_string() const { return get_type_info(m_type).m_cname; } -size_t ov::element::Type::size() const { +size_t Type::size() const { return (bitwidth() + 7) >> 3; } -size_t ov::element::Type::hash() const { +size_t Type::hash() const { return static_cast(m_type); } -std::string ov::element::Type::get_type_name() const { +std::string Type::get_type_name() const { return 
to_string(); } -std::string ov::element::Type::to_string() const { +std::string Type::to_string() const { return get_type_info(m_type).m_type_name; } -namespace ov { -namespace element { -template <> -Type from() { - return Type_t::boolean; -} -template <> -Type from() { - return Type_t::boolean; -} -template <> -Type from() { - return Type_t::f16; -} -template <> -Type from() { - return Type_t::f32; -} -template <> -Type from() { - return Type_t::f64; -} -template <> -Type from() { - return Type_t::i8; -} -template <> -Type from() { - return Type_t::i16; -} -template <> -Type from() { - return Type_t::i32; -} -template <> -Type from() { - return Type_t::i64; -} -template <> -Type from() { - return Type_t::u8; -} -template <> -Type from() { - return Type_t::u16; -} -template <> -Type from() { - return Type_t::u32; -} -template <> -Type from() { - return Type_t::u64; -} -template <> -Type from() { - return Type_t::bf16; -} -template <> -Type from() { - return Type_t::f8e4m3; -} -template <> -Type from() { - return Type_t::f8e5m2; -} -template <> -Type from() { - return Type_t::string; -} -template <> -Type from() { - return Type_t::f4e2m1; -} -template <> -Type from() { - return Type_t::f8e8m0; -} - Type fundamental_type_for(const Type& type) { switch (type) { case Type_t::boolean: @@ -322,44 +259,24 @@ Type fundamental_type_for(const Type& type) { } } -} // namespace element -} // namespace ov - -std::ostream& ov::element::operator<<(std::ostream& out, const ov::element::Type& obj) { +std::ostream& operator<<(std::ostream& out, const Type& obj) { return out << obj.to_string(); } -std::istream& ov::element::operator>>(std::istream& in, ov::element::Type& obj) { - const std::unordered_map legacy = { - {"BOOL", ov::element::boolean}, {"BF16", ov::element::bf16}, {"I4", ov::element::i4}, - {"I8", ov::element::i8}, {"I16", ov::element::i16}, {"I32", ov::element::i32}, - {"I64", ov::element::i64}, {"U4", ov::element::u4}, {"U8", ov::element::u8}, - {"U16", ov::element::u16}, {"U32", ov::element::u32}, {"U64", ov::element::u64}, - {"FP32", ov::element::f32}, {"FP64", ov::element::f64}, {"FP16", ov::element::f16}, - {"BIN", ov::element::u1}, {"NF4", ov::element::nf4}, {"F8E4M3", ov::element::f8e4m3}, - {"F8E5M2", ov::element::f8e5m2}, {"STRING", ov::element::string}, {"F4E2M1", ov::element::f4e2m1}, - {"F8E8M0", ov::element::f8e8m0}}; +std::istream& operator>>(std::istream& in, Type& obj) { std::string str; in >> str; - auto it_legacy = legacy.find(str); - if (it_legacy != legacy.end()) { - obj = it_legacy->second; - return in; - } - for (auto&& type : Type::get_known_types()) { - if (type->to_string() == str) { - obj = *type; - break; - } + if (const auto type_idx = type_idx_for(str); is_valid_type_idx(type_idx)) { + obj = {static_cast(type_idx)}; } return in; } -bool ov::element::Type::compatible(const ov::element::Type& t) const { +bool Type::compatible(const Type& t) const { return (is_dynamic() || t.is_dynamic() || *this == t); } -bool ov::element::Type::merge(ov::element::Type& dst, const ov::element::Type& t1, const ov::element::Type& t2) { +bool Type::merge(Type& dst, const Type& t1, const Type& t2) { if (t1.is_dynamic()) { dst = t2; return true; @@ -374,69 +291,30 @@ bool ov::element::Type::merge(ov::element::Type& dst, const ov::element::Type& t } } -bool ov::element::Type::is_static() const { +bool Type::is_static() const { return get_type_info(m_type).m_bitwidth != 0; } -bool ov::element::Type::is_real() const { +bool Type::is_real() const { return get_type_info(m_type).m_is_real; } 
-bool ov::element::Type::is_integral_number() const { - return is_integral() && (m_type != ov::element::boolean); +bool Type::is_integral_number() const { + return is_integral() && (m_type != boolean); } -bool ov::element::Type::is_signed() const { +bool Type::is_signed() const { return get_type_info(m_type).m_is_signed; } -bool ov::element::Type::is_quantized() const { +bool Type::is_quantized() const { return get_type_info(m_type).m_is_quantized; } -size_t ov::element::Type::bitwidth() const { +size_t Type::bitwidth() const { return get_type_info(m_type).m_bitwidth; } - -inline size_t compiler_byte_size(ov::element::Type_t et) { - switch (et) { -#define ET_CASE(et) \ - case ov::element::Type_t::et: \ - return sizeof(ov::element_type_traits::value_type); - ET_CASE(boolean); - ET_CASE(bf16); - ET_CASE(f16); - ET_CASE(f32); - ET_CASE(f64); - ET_CASE(i4); - ET_CASE(i8); - ET_CASE(i16); - ET_CASE(i32); - ET_CASE(i64); - ET_CASE(u1); - ET_CASE(u2); - ET_CASE(u3); - ET_CASE(u4); - ET_CASE(u6); - ET_CASE(u8); - ET_CASE(u16); - ET_CASE(u32); - ET_CASE(u64); - ET_CASE(nf4); - ET_CASE(f8e4m3); - ET_CASE(f8e5m2); - ET_CASE(string); - ET_CASE(f4e2m1); - ET_CASE(f8e8m0); -#undef ET_CASE - case ov::element::Type_t::undefined: - return 0; - case ov::element::Type_t::dynamic: - return 0; - } - - OPENVINO_THROW("compiler_byte_size: Unsupported value of ov::element::Type_t: ", static_cast(et)); -} +} // namespace ov::element namespace ov { template <> diff --git a/src/frontends/common/include/openvino/frontend/extension/conversion.hpp b/src/frontends/common/include/openvino/frontend/extension/conversion.hpp index ee861b46033988..46f8ff1e793072 100644 --- a/src/frontends/common/include/openvino/frontend/extension/conversion.hpp +++ b/src/frontends/common/include/openvino/frontend/extension/conversion.hpp @@ -20,7 +20,7 @@ class FRONTEND_API ConversionExtensionBase : public ov::Extension { return m_op_type; } - ~ConversionExtensionBase() override = default; + ~ConversionExtensionBase() override; private: std::string m_op_type; diff --git a/src/frontends/common/src/extension/conversion.cpp b/src/frontends/common/src/extension/conversion.cpp index ff90c163604f8c..febd3868c2798b 100644 --- a/src/frontends/common/src/extension/conversion.cpp +++ b/src/frontends/common/src/extension/conversion.cpp @@ -6,4 +6,6 @@ using namespace ov::frontend; +ConversionExtensionBase::~ConversionExtensionBase() = default; + ConversionExtension::~ConversionExtension() = default; diff --git a/src/frontends/tensorflow_common/src/op/select.cpp b/src/frontends/tensorflow_common/src/op/select.cpp index f19e01f5a021e6..35c7e893e542e1 100644 --- a/src/frontends/tensorflow_common/src/op/select.cpp +++ b/src/frontends/tensorflow_common/src/op/select.cpp @@ -13,6 +13,7 @@ #include "openvino/op/shape_of.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/subtract.hpp" +#include "openvino/op/unsqueeze.hpp" using namespace std; using namespace ov; @@ -31,7 +32,19 @@ OutputVector translate_select_base_op(const NodeContext& node, set_node_name(node.get_name(), select); return {select}; } - +bool has_complex_inputs(Output& x, Output& y, element::Type& complex_part_type) { + auto complex_type_mark_x = as_type_ptr(x.get_node_shared_ptr()); + auto complex_type_mark_y = as_type_ptr(y.get_node_shared_ptr()); + if (complex_type_mark_x) { + x = complex_type_mark_x->input_value(0); + complex_part_type = complex_type_mark_x->get_complex_part_type(); + } + if (complex_type_mark_y) { + y = complex_type_mark_y->input_value(0); + complex_part_type = 
complex_type_mark_y->get_complex_part_type(); + } + return (complex_type_mark_x || complex_type_mark_y); +} OutputVector translate_select_v2_op(const NodeContext& node) { // according to the TensorFlow documentation. See in the code: // https://github.com/tensorflow/tensorflow/blob/v2.4.1/tensorflow/lite/kernels/select.cc#L188-L211 @@ -40,10 +53,23 @@ OutputVector translate_select_v2_op(const NodeContext& node) { // is true or the value of 'y' if false. There are valid condition input sizes: // 1. Either the same shape (in which case the select is elementwise), or // 2. Broadcastable shapes between 'condition', 'x' and 'y'. - default_op_checks(node, 3, {"SelectV2", "SELECT_V2"}); - // no preparation for inputs are needed - // inputs are already NumPy broadcastable - return translate_select_base_op(node, node.get_input(0), node.get_input(1), node.get_input(2)); + default_op_checks(node, 3, {"SelectV2", "SELECT_V2"}, true); + auto condition = node.get_input(0); + auto x = node.get_input(1); + auto y = node.get_input(2); + + element::Type complex_part_type; + auto is_complex = has_complex_inputs(x, y, complex_part_type); + + if (is_complex) { + auto const_negative_one = make_shared<v0::Constant>(element::i32, Shape{1}, -1); + auto new_condition = make_shared<v0::Unsqueeze>(condition, const_negative_one); + auto result = translate_select_base_op(node, new_condition, x, y); + auto complex_result = make_shared<ComplexTypeMark>(result[0].get_node_shared_ptr(), complex_part_type); + return {complex_result->output(0)}; + } else { + return translate_select_base_op(node, condition, x, y); + } } OutputVector translate_select_op(const NodeContext& node) { @@ -59,21 +85,9 @@ OutputVector translate_select_op(const NodeContext& node) { auto condition = node.get_input(0); auto x = node.get_input(1); auto y = node.get_input(2); - auto complex_type_mark_x = as_type_ptr<ComplexTypeMark>(x.get_node_shared_ptr()); - auto complex_type_mark_y = as_type_ptr<ComplexTypeMark>(y.get_node_shared_ptr()); - auto is_complex = (complex_type_mark_x || complex_type_mark_y); element::Type complex_part_type; - - if (complex_type_mark_x) { - x = complex_type_mark_x->input_value(0); - complex_part_type = complex_type_mark_x->get_complex_part_type(); - } - - if (complex_type_mark_y) { - y = complex_type_mark_y->input_value(0); - complex_part_type = complex_type_mark_y->get_complex_part_type(); - } + auto is_complex = has_complex_inputs(x, y, complex_part_type); // compute number of dimensions to unsqueeze the condition auto cond_rank = compute_subgraph_scalar_rank(condition, element::i32); @@ -85,14 +99,13 @@ OutputVector translate_select_op(const NodeContext& node) { auto new_subshape = make_shared<v3::Broadcast>(const_one, num_new_axes); auto cond_shape = make_shared<v3::ShapeOf>(condition, element::i32); // use extra dimensions in the begin to avoid concatenation of empty tensors that is not supported by Concat - auto const_1 = make_shared<v0::Constant>(element::i32, Shape{1}, 1); - auto new_cond_shape = make_shared<v0::Concat>(OutputVector{const_1, cond_shape, new_subshape}, 0); + auto new_cond_shape = make_shared<v0::Concat>(OutputVector{const_one, cond_shape, new_subshape}, 0); // prepare the condition to have the same rank as operands `x` and `y` auto prep_cond = make_shared<v1::Reshape>(condition, new_cond_shape, false)->output(0); // squeeze prep_cond by one extra dimension specially added - auto const_0 = make_shared<v0::Constant>(element::i32, Shape{1}, 0); - prep_cond = make_shared<v0::Squeeze>(prep_cond, const_0); + auto const_zero = make_shared<v0::Constant>(element::i32, Shape{1}, 0); + prep_cond = make_shared<v0::Squeeze>(prep_cond, const_zero); auto result = translate_select_base_op(node, prep_cond, x, y); if
(is_complex) { diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 0cad1840e5d1a8..7e2a0a8b4be441 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -38,18 +38,6 @@ ov::ICore::~ICore() = default; -namespace ov { -namespace util { -template -constexpr std::array< - typename std::conditional::value, typename std::common_type::type, T>::type, - sizeof...(Args)> -make_array(Args&&... args) { - return {std::forward(args)...}; -} -} // namespace util -} // namespace ov - namespace { #ifdef PROXY_PLUGIN_ENABLED diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index 54d4ffaa433944..0cb074da6d426f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -253,7 +253,7 @@ void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, size_t offset_out, size_t offset_comp) { EmitABIRegSpills spill(this); - spill.preamble(); + spill.preamble(get_live_regs()); const auto add_offset = [&](Xbyak::Reg64 reg, size_t bytes_offset) { if (bytes_offset) { @@ -298,6 +298,16 @@ void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, spill.postamble(); } +std::set<snippets::Reg> BrgemmCopyBKernel::get_live_regs() const { + // Only the registers `src_reg`, `tr_src_reg` and `comp_reg` should be + // saved on each `jit_brgemm_matmul_copy_b_t` binary call. + // They're ABI parameter registers (caller-saved), so we have to manually + // spill only them on each `jit_brgemm_matmul_copy_b_t` binary call. + return {{snippets::RegType::gpr, static_cast<size_t>(src_reg.getIdx())}, + {snippets::RegType::gpr, static_cast<size_t>(tr_src_reg.getIdx())}, + {snippets::RegType::gpr, static_cast<size_t>(comp_reg.getIdx())}}; +} + void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, const void* src, const void* dst, diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index 5ef740067f2035..594702fed4ae3c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -170,6 +170,8 @@ struct BrgemmCopyBKernel : public RepackedInputKernel, public dnnl::impl::cpu::x void init_brgemm_copy_b_kernel(std::unique_ptr& kernel, const BrgemmCopyBKernelConfig& conf) const; + std::set<snippets::Reg> get_live_regs() const; + static constexpr auto abi_param_regs = dnnl::impl::cpu::x64::abi_param_regs; const Xbyak::Reg64 src_reg = abi_param2; const Xbyak::Reg64 tr_src_reg = abi_param3; diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index e50f8f413915a5..54e4e2e6fd5a8b 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -150,9 +150,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene data_type::f32); } - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16 = std::make_shared<jit_uni_vcvtneps2bf16>(this, isa); - } + uni_vcvtneps2bf16 = std::make_shared<jit_uni_vcvtneps2bf16>(this, isa); this->preamble(); @@ -188,9 +186,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene this->postamble(); - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16->emit_data(); - } + 
uni_vcvtneps2bf16->emit_data(); if (jcp_.reduce_mode == Algorithm::ReduceAnd || jcp_.reduce_mode == Algorithm::ReduceL1 || jcp_.reduce_mode == Algorithm::ReduceMax || jcp_.reduce_mode == Algorithm::ReduceMin || @@ -1017,9 +1013,15 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene uni_vmovups(op, vmm_dst); break; case memory::data_type::bf16: - uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, - {static_cast<size_t>(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); + if (isa == cpu::x64::avx512_core) { + uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, + {static_cast<size_t>(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + } else { + uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, + {static_cast<size_t>(xmm_dst.getIdx())}); + uni_vmovdqu(op, xmm_dst); + } break; case memory::data_type::f16: vcvtps2ph(op, vmm_dst, 0x4); @@ -1253,9 +1255,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi data_type::f32); } - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16 = std::make_shared<jit_uni_vcvtneps2bf16>(this, isa); - } + uni_vcvtneps2bf16 = std::make_shared<jit_uni_vcvtneps2bf16>(this, isa); this->preamble(); @@ -1312,9 +1312,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi this->postamble(); - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16->emit_data(); - } + uni_vcvtneps2bf16->emit_data(); if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) { log_injector->prepare_table(); @@ -1770,9 +1768,15 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi uni_vmovups(op, vmm_dst); break; case memory::data_type::bf16: - uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, - {static_cast<size_t>(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); + if (isa == cpu::x64::avx512_core) { + uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, + {static_cast<size_t>(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + } else { + uni_vcvtneps2bf16->emit_code({static_cast<size_t>(vmm_dst.getIdx())}, + {static_cast<size_t>(xmm_dst.getIdx())}); + uni_vmovdqu(op, xmm_dst); + } break; case memory::data_type::f16: vcvtps2ph(op, vmm_dst, 0x4); diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp index 3805fde5ce9bfb..a383fc2b7df220 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp @@ -34,16 +34,16 @@ struct Range { max_known_port = std::max(static_cast(max_known_port), 1); for (size_t port = 0; port < max_known_port; port++) { std::map<ov::element::Type, ov::test::utils::InputGenerateData> type_map; - for (auto& type : ov::element::Type::get_known_types()) { - ov::test::utils::InputGenerateData new_range = rangeByType.get_range(*type); - if (type->is_real() && port < real_port_ranges.size()) { + for (const auto& type : get_known_types()) { + ov::test::utils::InputGenerateData new_range = rangeByType.get_range(type); + if (type.is_real() && port < real_port_ranges.size()) { new_range.correct_range(real_port_ranges.at(port)); new_range.input_attribute = real_port_ranges.at(port).input_attribute; - } else if (type->is_integral() && port < int_port_ranges.size()) { + } else if (type.is_integral() && port < int_port_ranges.size()) { new_range.correct_range(int_port_ranges.at(port)); new_range.input_attribute = int_port_ranges.at(port).input_attribute; } - 
type_map[*type] = new_range; + type_map[type] = new_range; } data.push_back(type_map); } diff --git a/src/tests/test_utils/common_test_utils/include/common_test_utils/type_ranges.hpp b/src/tests/test_utils/common_test_utils/include/common_test_utils/type_ranges.hpp index c84b58066387f3..7dc5841869a493 100644 --- a/src/tests/test_utils/common_test_utils/include/common_test_utils/type_ranges.hpp +++ b/src/tests/test_utils/common_test_utils/include/common_test_utils/type_ranges.hpp @@ -15,6 +15,21 @@ namespace ov { namespace test { namespace utils { +static const std::vector<ov::element::Type>& get_known_types() { + static const auto known_types = [] { + using namespace ov::element; + constexpr size_t enum_count = static_cast<std::underlying_type_t<Type_t>>(Type_t::f8e8m0) - 1; + + std::vector<Type> types(enum_count); + for (size_t idx = 1, i = 0; i < types.size(); ++idx, ++i) { + types[i] = Type{static_cast<Type_t>(idx)}; + } + return types; + }(); + + return known_types; +} + static ov::test::utils::InputGenerateData get_range_by_type( ov::element::Type elemType, uint32_t max_range_limit = testing::internal::Random::kMaxRange) { @@ -110,8 +125,8 @@ struct RangeByType { std::map<ov::element::Type, ov::test::utils::InputGenerateData> data; RangeByType() { - for (auto& type : ov::element::Type::get_known_types()) { - data[*type] = get_range_by_type(*type); + for (const auto& type : get_known_types()) { + data[type] = get_range_by_type(type); } } diff --git a/tests/layer_tests/tensorflow_tests/test_tf_SelectV2.py b/tests/layer_tests/tensorflow_tests/test_tf_SelectV2.py index 058f2e21a4a60b..d199275bf34345 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_SelectV2.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_SelectV2.py @@ -51,3 +51,52 @@ def test_select_v2_basic(self, params, ie_device, precision, ir_version, temp_di self._test(*self.create_select_v2_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) + + +class TestComplexSelectV2(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + rng = np.random.default_rng() + assert 'cond:0' in inputs_info, "Test error: inputs_info must contain `cond`" + assert 'x_real:0' in inputs_info, "Test error: inputs_info must contain `x_real`" + assert 'x_imag:0' in inputs_info, "Test error: inputs_info must contain `x_imag`" + assert 'y_real:0' in inputs_info, "Test error: inputs_info must contain `y_real`" + assert 'y_imag:0' in inputs_info, "Test error: inputs_info must contain `y_imag`" + cond_shape = inputs_info['cond:0'] + inputs_data = {} + inputs_data['cond:0'] = np.random.randint(0, 2, cond_shape).astype(bool) + for part in ['x_real:0', 'x_imag:0', 'y_real:0', 'y_imag:0']: + inputs_data[part] = 4 * rng.random(inputs_info[part]).astype(np.float32) - 2 + return inputs_data + + def create_complex_select_v2_net(self, cond_shape, x_shape, y_shape): + tf.compat.v1.reset_default_graph() + # Create the graph and model + with tf.compat.v1.Session() as sess: + cond = tf.compat.v1.placeholder(tf.bool, cond_shape, 'cond') + x_real = tf.compat.v1.placeholder(tf.float32, x_shape, 'x_real') + x_imag = tf.compat.v1.placeholder(tf.float32, x_shape, 'x_imag') + y_real = tf.compat.v1.placeholder(tf.float32, y_shape, 'y_real') + y_imag = tf.compat.v1.placeholder(tf.float32, y_shape, 'y_imag') + complex_x = tf.raw_ops.Complex(real=x_real, imag=x_imag) + complex_y = tf.raw_ops.Complex(real=y_real, imag=y_imag) + complex_select = tf.raw_ops.SelectV2(condition=cond, t=complex_x, e=complex_y) + tf.raw_ops.Real(input=complex_select) + tf.raw_ops.Imag(input=complex_select) + tf.compat.v1.global_variables_initializer() + 
tf_net = sess.graph_def + return tf_net, None + + test_data_basic = [ + dict(cond_shape=[3, 1], x_shape=[3, 1], y_shape=[3, 1]), + dict(cond_shape=[], x_shape=[2], y_shape=[3, 2]), + dict(cond_shape=[4], x_shape=[3, 2, 1], y_shape=[2, 4]), + ] + + @pytest.mark.parametrize("params", test_data_basic) + @pytest.mark.precommit + @pytest.mark.nightly + def test_complex_select_v2(self, params, ie_device, precision, ir_version, temp_dir, + use_legacy_frontend): + self._test(*self.create_complex_select_v2_net(**params), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) \ No newline at end of file
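
Note: the snippet below is a minimal NumPy sketch, not part of the patch, illustrating why the complex branch of translate_select_v2_op unsqueezes the condition along the last axis. The TF frontend's ComplexTypeMark lowers a complex tensor to a floating-point tensor with a trailing axis of size 2 holding the real and imaginary parts, so the boolean condition needs one extra trailing axis before it can broadcast against the lowered operands. All shapes are illustrative.

import numpy as np

# Complex operands lowered the way ComplexTypeMark represents them:
# a real tensor with a trailing axis of size 2 holding [real, imag].
x = np.stack([np.ones((3, 2)), np.zeros((3, 2))], axis=-1)     # shape (3, 2, 2)
y = np.stack([-np.ones((3, 2)), np.zeros((3, 2))], axis=-1)    # shape (3, 2, 2)
cond = np.array([[True, False], [False, True], [True, True]])  # shape (3, 2)

# Without the extra axis, (3, 2) does not broadcast against (3, 2, 2);
# cond[..., None] plays the role of the Unsqueeze(condition, -1) in the diff.
selected = np.where(cond[..., None], x, y)                     # shape (3, 2, 2)

real, imag = selected[..., 0], selected[..., 1]
print(real.shape, imag.shape)  # (3, 2) (3, 2)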