playback_giza_model.py

# Playback █████

```
█████████████████████████████████████████████████████████████████████████████████████████████
██████████████████████████▓░░░░░░░░░░░░░░░░░░▒▒▒░░░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓░░░░░░░░░░░░░░░░░░▒▓▓▒░░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓░░░░░░░░░░░░░░░░░░▒▓▓▓▒░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓░░░░░░░░░░░░░░░░░░▒▓▓▓▓░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓░░░░░░░░░░░░░░░░░░▒██▓▓░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓▒▒▒▒▒▒▒▒▓▓▓▒▒▒▒▒▒▒▓█▓▓▓▒▒▓▒▒▒▒▒▒▒▒▒███████████████████████████████
██████████████████████████▓▒▒▒▒▒▒██████▓▒▒▒▒▒▒█▓██████▓▓▒▒▒▒▒▒███████████████████████████████
██████████████████████████▓░░░░▒█▓▓████▓▓▒░░░▒▓███▓▓▓▓▓▓▓▓▒░░▒███████████████████████████████
██████████████████████████▓░░░▒▓█▓▓▓▓▓▓▓▓▓▒░░▒▓▓▓▓▓▓▓▓▓▓▓▓▓▒░▒███████████████████████████████
██████████████████████████▓░░▒▓█▓██▓▓█████▓▒░▒██▓▓██▓█▓███▓▒░▒███████████████████████████████
██████████████████████████▓░▒▓███▓▒▒▒▒▓▓██▓▓░▒█▓▓▓▒▒▒▒▒▓███▓▒▒███████████████████████████████
██████████████████████████▓▒▓████▒▒▒▒▒▒▓█▓█▓▓▓▓▓█▒▒▒▒▒▒▒██▓▓▓▒███████████████████████████████
██████████████████████████▓▒▓█▓▓▓▒▒▒▒▒▒▒▒▓▓███▓▒▒▒▒▒▒▒▒▒███▓▓▒███████████████████████████████
██████████████████████████▓░▓▓█▓▓░░░░░░░░▓▓█▓▓▓▒░░░░░░░▒█▓▓▓▓▒███████████████████████████████
██████████████████████████▓░▒▓▓▓▒░░░░░░░░░▒▓▓▓▓▓▒░░░░░░▒▓▓▓▓▓▒███████████████████████████████
██████████████████████████▓░▒▓▓▓▓▒░░░░░▒▓▓░▓▓▓▓▓▒▒░░░░░▓▓▓▓▓▓▒███████████████████████████████
██████████████████████████▓░▓▓█▓▓▓▒░░░▒▓██▓▒▒████▓▒░░░░███▓▓▒▒███████████████████████████████
██████████████████████████▓░▓▓▓▓▓▓▓▓▓▓▓████▓▒▓▓▓▓▓▓▓▓▓▓▓▓▓▓▒▒▒███████████████████████████████
██████████████████████████▓▒▓█▓▓▓▓▓█████▓█▓▒▒▒▓███████████▓▒▒▒███████████████████████████████
██████████████████████████▓░▓▓▓███████▓▓█▓▒░░░▒▓▓▓▓▓▓▓▓▓▓▓▒░░▒███████████████████████████████
██████████████████████████▓░▓▓▓▓▓▓▓▓▓▓▓▓▓▒░░░░░░▒▓▓▓▓▓▓▒▒░░░░▒███████████████████████████████
██████████████████████████▓░▓▓▓▓▓▒▒▒▒▒▒▒░░░░░░░░░░▒▒▒▒▒░░░░░░▒███████████████████████████████
██████████████████████████▓░▓███▓░░░░░░░░░░░░░░░░░░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓░▓██▓▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░▒███████████████████████████████
██████████████████████████▓▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒███████████████████████████████
█████████████████████████████████████████████████████████████████████████████████████████████
█████████▓▓████▓▓▓▓▓▓▓▓▓▓████████▓▓███▓▓▓▓▓███▓██████████▓▓▓████████▓▓▓▓████████▓▓████▓▓▓▓███
███▓▓▓▓▓██▓▓██▓▓▓▓▓▓▓▓▓▓▓██▓▓▓▓██▓▓▓██▓▓▓▓▓██▓▓███▓▓▓▓▓██▓▓▓██▓▓▓▓▓██▓▓▓██▓▓▓▓▓█▓▓▓██▓▓▓▓▓██▓
██▓▓▒▓▓▓██▓▓██▓▓▒▓▓▓▓▓▓▓██▓▓▓▒▓▓██▓▓▓██▓▓▓██▓▓▓███▒▓▓▓▒███▒███▓▓▓▓▓██▓▓██▓▓▒▓▓▓██▓▓██▓▓▓▓███▒
█████████▒░▒██▒░░░░░░░░▒██▒░░░░▒██▒░░░█████▒░░░▓████████▒░░██▓░░░░░██▓░██▓░░░░░░░░▒██████▒░░░
██▓▒▒▒▒▒▒▒▒▒██▓▒▒▒▒▒▒▒▒▒██████████▒▒▒▒▒▓██▒▒▒▒▒▓██▒▒▒▒▒██▓▒██████████▓▒██▓▒▒▒▒▒██▒▒██▓▒▒▒▓█▓▒
██▓▒▒▒▒▒▒▒▒▒██▓▓▓▓▓▓▓▓▒▒██▓▓▓▓▓▓██▒▒▒▒▒▓██▒▒▒▒▒▓██▓▓▓▓▓█▓▓▒███▓▓▓▓▓██▓▒▓█▓▓▓▓▓▓█▓▒▒██▒▒▒▒▒▓█▓
██▓▒▒▒▒▒▒▒▒▒██████████▓▓██▒▒▒▒▒▓██▒▒▒▒▒▓██▒▒▒▒▒▓████████▓▒▒██▓▒▒▒▒▒██▓▒▒▓███████▒▒▒██▓▒▒▒▒▓██
█████████████████████████████████████████████████████████████████████████████████████████████


```

import requests
import numpy as np
import cv2
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModel
import torch.onnx

from giza.agents.model import GizaModel

# -----------------------------------------------------------------------------
# Helper functions for downloading and processing data
# -----------------------------------------------------------------------------

def download_image(image_url, filename, timeout=30):
    """Download an image from a URL and save it to a file.

    Args:
        image_url (str): The URL of the image.
        filename (str): Path of the file the image bytes are written to.
        timeout (float): Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as an "image".
    response.raise_for_status()
    with open(filename, 'wb') as handler:
        handler.write(response.content)

def download_task(task_url, task_filename, timeout=30):
    """Download a task description from a URL and save it to a file.

    Args:
        task_url (str): The URL of the task description.
        task_filename (str): Path of the file the task bytes are written to.
        timeout (float): Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(task_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error body as the task text.
    response.raise_for_status()
    with open(task_filename, 'wb') as handler:
        handler.write(response.content)

def download_model(model_url, model_filename, timeout=30, chunk_size=1 << 20):
    """Download a model from a URL and save it to a file.

    The body is streamed to disk in chunks so a large model file is never
    held in memory as a whole.

    Args:
        model_url (str): The URL of the model.
        model_filename (str): Path of the file the model bytes are written to.
        timeout (float): Seconds to wait for the server before giving up.
        chunk_size (int): Number of bytes fetched per streamed chunk.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    import os

    # Write to a side file first so an interrupted download never leaves a
    # truncated file under the final name.
    partial_filename = f"{model_filename}.part"
    with requests.get(model_url, stream=True, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as a model.
        response.raise_for_status()
        with open(partial_filename, 'wb') as handler:
            for chunk in response.iter_content(chunk_size=chunk_size):
                handler.write(chunk)
    os.replace(partial_filename, model_filename)

def read_task(task_filename):
    """Read a task description from a text file.

    Args:
        task_filename (str): The name of the file containing the task description.

    Returns:
        list: One string per line of the file, with trailing whitespace
        (including the newline) removed.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default locale encoding. Task files are assumed to be UTF-8.
    with open(task_filename, encoding="utf-8") as task_file:
        return [line.rstrip() for line in task_file]

def get_image(path):
    """Open an image file and return its pixels as an RGB array.

    Args:
        path (str): The path to the image file.

    Returns:
        numpy.ndarray: The image converted to RGB, as a NumPy array.
    """
    with Image.open(path) as source:
        # Convert while the file is still open, then copy into an array.
        rgb = source.convert('RGB')
        pixels = np.array(rgb)
    return pixels

def preprocess(img):
    """Turn an RGB image array into a normalized 1x3x224x224 float32 batch.

    The image is scaled to [0, 1], resized to 256x256, center-cropped to
    224x224, normalized per channel with the usual ImageNet mean/std values,
    moved to channel-first layout and given a leading batch axis.

    Args:
        img (numpy.ndarray): The image data as a NumPy array (height, width, 3).

    Returns:
        numpy.ndarray: The preprocessed image data.
    """
    crop = 224
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    scaled = img / 255.0
    resized = cv2.resize(scaled, (256, 256))

    # Center crop: split the surplus rows/columns evenly on both sides.
    height, width = resized.shape[:2]
    top = (height - crop) // 2
    left = (width - crop) // 2
    cropped = resized[top:top + crop, left:left + crop, :]

    normalized = (cropped - channel_mean) / channel_std
    channel_first = normalized.transpose(2, 0, 1).astype(np.float32)
    return channel_first[np.newaxis, ...]

def predict_caption(model, processor, img):
    """Produce a text caption for one image.

    Args:
        model (BlipForConditionalGeneration): Model whose ``generate`` yields token ids.
        processor (BlipProcessor): Builds the model inputs and decodes the output ids.
        img (numpy.ndarray): The image data handed to the processor.

    Returns:
        str: The caption decoded from the first generated sequence.
    """
    encoded = processor(images=img, return_tensors="pt")
    # Only the first generated sequence is turned into text.
    first_sequence = model.generate(**encoded)[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

def get_embeddings(text, model, tokenizer):
    """Embed a text as the mean of the model's last-layer token vectors.

    Args:
        text (str): The text to generate embeddings for.
        model (AutoModel): Model whose output exposes ``last_hidden_state``.
        tokenizer (AutoTokenizer): The tokenizer for the model.

    Returns:
        numpy.ndarray: The mean over the token axis of ``last_hidden_state``,
        with singleton dimensions squeezed out.
    """
    # Truncate over-long text to the tokenizer's maximum length so it cannot
    # exceed what the model accepts.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

# -----------------------------------------------------------------------------
# Playback Giza execution script
# -----------------------------------------------------------------------------

def execution():
    """Run the Playback Giza valuation pipeline end to end.

    Downloads the two input images, a ResNet-50 ONNX file and the task
    description, captions each image with BLIP, embeds the captions and a
    task description with a MiniLM model, and converts the cosine similarity
    of the two embeddings into an integer valuation.

    Returns:
        tuple: ``(valuation, giza_model, processor)`` where ``valuation`` is
        ``int(similarity * 100)``, ``giza_model`` is the ``GizaModel`` loaded
        from ``blip_image_captioning_model.onnx`` and ``processor`` is the
        ``BlipProcessor`` used for captioning.
    """
    # Path to the ONNX model for image captioning, loaded by Giza below.
    # NOTE(review): nothing in this function creates or downloads this file;
    # presumably it must already exist on disk -- confirm.
    model_path = "blip_image_captioning_model.onnx"

    # URLs for images and their filenames
    image_urls = [
        'https://s3.amazonaws.com/model-server/inputs/0xc0d08ed5b0f759cbc528abf16ae6e2fb33f935379a7b1fa2182753f1019fa721_0.jpg',
        'https://s3.amazonaws.com/model-server/inputs/0xc0d08ed5b0f759cbc528abf16ae6e2fb33f935379a7b1fa2182753f1019fa721_1.jpg',
    ]
    image_filenames = ['kitten_0.jpg', 'kitten_1.jpg']

    # Download images
    for url, filename in zip(image_urls, image_filenames):
        download_image(url, filename)

    # Download ONNX model for image classification
    # NOTE(review): this file is downloaded but never loaded below; the call
    # is kept so the files left on disk are unchanged.
    download_model(
        'https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx',
        'resnet50-v1-12.onnx'
    )

    # Load BLIP model for image captioning
    model_name = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(model_name)
    caption_model = BlipForConditionalGeneration.from_pretrained(model_name)

    # Download task description
    task_filename = '0xc0d08ed5b0f759cbc528abf16ae6e2fb33f935379a7b1fa2182753f1019fa721.txt'
    download_task(
        'https://s3.amazonaws.com/PLAYBACKGIZAS3/0xc0d08ed5b0f759cbc528abf16ae6e2fb33f935379a7b1fa2182753f1019fa721.txt',
        task_filename
    )
    # NOTE(review): the downloaded task text is read but not used; the
    # similarity below is computed against the hard-coded description.
    # Confirm which of the two is intended.
    task = read_task(task_filename)

    # Load Giza model with ONNX runtime. It gets its own name: rebinding
    # ``caption_model`` here would hand predict_caption() an object other
    # than the BLIP model whose ``generate()`` it calls.
    # NOTE(review): the Giza model is loaded and returned but not yet used
    # for inference.
    giza_model = GizaModel(model_path=model_path)

    # Generate captions for images
    captions = []
    for img_filename in image_filenames:
        # Pass the raw RGB array: the BLIP processor performs its own
        # resize/rescale/normalize, so running preprocess() first would
        # normalize the pixels twice.
        img = get_image(img_filename)
        captions.append(predict_caption(caption_model, processor, img))

    # Combine embeddings for captions
    embedding_model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
    embedding_model = AutoModel.from_pretrained(embedding_model_name)

    embeddings = [get_embeddings(caption, embedding_model, tokenizer) for caption in captions]
    playback_embedding = np.mean(embeddings, axis=0)

    # Generate embedding for task description
    task_description = "Create a Giza wallet"
    task_embedding = get_embeddings(task_description, embedding_model, tokenizer)

    # Calculate similarity between embeddings as valuation (cosine similarity)
    norm_product = np.linalg.norm(playback_embedding) * np.linalg.norm(task_embedding)
    similarity = np.dot(playback_embedding, task_embedding) / norm_product
    print(f"Similarity between playback embedding and task description: {similarity}")

    # multiply by 100 to get a $BACK token offer
    valuation = int(similarity * 100)

    return valuation, giza_model, processor

# Execute the script
# execution() returns three values (valuation, model, processor); unpacking
# only two names raised "ValueError: too many values to unpack".
valuation, caption_model, processor = execution()