Merged to main branch #17

Merged 14 commits on Dec 18, 2023
2 changes: 2 additions & 0 deletions osculari/datasets/dataset_utils.py
@@ -186,6 +186,8 @@ def background_img(bg_type: Any, bg_size: Union[int, Tuple], im2double=True) ->
num_colours = np.random.randint(3, 25)
num_patches = np.random.randint(2, bg_size[0] // 20)
bg_img = _patch_img(bg_size, num_colours, num_patches, channels)
if 'achromatic' in bg_type:
bg_img = np.repeat(bg_img, 3, axis=2)
else:
raise RuntimeError('Unsupported background type %s.' % bg_type)
# Handle user-specified background values
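For reference, a minimal standalone sketch of what the new achromatic branch above does; the array shapes are illustrative assumptions, not values taken from the library:

```python
import numpy as np

# A single-channel patchy background of shape (H, W, 1) is repeated along
# the channel axis so the result is a 3-channel, achromatic (grey) image.
bg_img = np.random.randint(0, 256, size=(128, 128, 1)).astype(np.uint8)
bg_rgb = np.repeat(bg_img, 3, axis=2)

assert bg_rgb.shape == (128, 128, 3)
# All three channels are identical copies of the original patch image.
assert np.array_equal(bg_rgb[..., 0], bg_rgb[..., 2])
```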
1 change: 1 addition & 0 deletions osculari/datasets/geometrical_shapes.py
@@ -159,6 +159,7 @@ def __init__(
self.num_samples = num_samples
self.num_images = num_images
self.img_size = img_size
assert callable(merge_fg_bg)
self.merge_fg_bg = merge_fg_bg
self.bg = background
self.unique_fg_shape = unique_fg_shape
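As a usage sketch, the new assertion simply guards against non-callable arguments; the exact signature expected of `merge_fg_bg` is not shown in this diff, so the blend below is an assumption for illustration:

```python
import numpy as np

def merge_fg_bg(fg: np.ndarray, bg: np.ndarray) -> np.ndarray:
    # Hypothetical merge function: a fixed 50/50 alpha blend of the
    # foreground shape image and the background image.
    return 0.5 * fg + 0.5 * bg

# The added `assert callable(merge_fg_bg)` accepts this function object but
# would fail fast for, e.g., a string or None passed by mistake.
assert callable(merge_fg_bg)
assert not callable('merge_fg_bg')
```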
2 changes: 1 addition & 1 deletion osculari/datasets/gratings.py
@@ -91,7 +91,7 @@ class GratingsDataset(TorchDataset):
"""

def __init__(self, img_size: int, spatial_frequencies: Optional[Sequence[int]] = None,
thetas: Optional[Sequence[int]] = None, gaussian_sigma: Optional[float] = None,
thetas: Optional[Sequence[float]] = None, gaussian_sigma: Optional[float] = None,
transform: Optional[Callable] = None) -> None:
super(GratingsDataset, self).__init__()
self.img_size = img_size
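With the corrected type hint, orientations can be arbitrary floats rather than integers. A construction sketch, assuming `GratingsDataset` is importable from `osculari.datasets` and that `thetas` are orientations in radians:

```python
import numpy as np
from osculari.datasets import GratingsDataset  # import path assumed

# Eight evenly spaced float orientations; an int-only hint would have been
# misleading for values such as pi / 8.
thetas = np.linspace(0, np.pi, num=8, endpoint=False).tolist()
dataset = GratingsDataset(img_size=224, spatial_frequencies=[2, 4, 8], thetas=thetas)
```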
10 changes: 1 addition & 9 deletions osculari/models/model_utils.py
@@ -103,16 +103,13 @@ def is_resnet_backbone(architecture: str) -> bool:
return 'resnet' in architecture or 'resnext' in architecture or 'taskonomy_' in architecture


def generic_features_size(model: nn.Module, img_size: int,
is_clip: Optional[bool] = False) -> Tuple[int]:
def generic_features_size(model: nn.Module, img_size: int) -> Tuple[int]:
"""
Compute the output size of a neural network model given an input image size.

Parameters:
model (nn.Module): The neural network model.
img_size (int): The input image size (assuming square images).
is_clip (Optional[bool]): Flag indicating whether the model is a CLIP model
(default is False).

Returns:
Tuple[int]: The computed output size of the model.
@@ -123,11 +120,6 @@ def generic_features_size(model: nn.Module, img_size: int,
# Convert the image to a PyTorch tensor and add batch dimension
img = torchvis_fun.to_tensor(img).unsqueeze(0)

# Move the input image to GPU and change the data type if is_clip is True
if is_clip:
img = img.cuda()
img = img.type(torch.float16)

# Set the model to evaluation mode
model.eval()

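A short usage sketch of the simplified helper, which is now device-agnostic (it no longer moves the probe image to the GPU or casts it to float16 for CLIP backbones); the import path and the printed shape are assumptions:

```python
import torch.nn as nn
from torchvision import models

from osculari.models import model_utils  # import path assumed

# Strip the classifier head of a torchvision ResNet-18 and measure the
# feature size it produces for a 224x224 input, entirely on the CPU.
backbone = nn.Sequential(*list(models.resnet18(weights=None).children())[:-2])
out_dim = model_utils.generic_features_size(backbone, img_size=224)
print(out_dim)  # expected to be roughly (512, 7, 7) for this backbone
```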
7 changes: 5 additions & 2 deletions osculari/models/pretrained_models.py
@@ -11,9 +11,10 @@
from torch.utils import model_zoo

from torchvision import models as torch_models
from visualpriors import taskonomy_network
import clip

from . import model_utils, pretrained_layers, taskonomy_network
from . import model_utils, pretrained_layers

_TORCHVISION_SEGMENTATION = [
'deeplabv3_mobilenet_v3_large',
@@ -474,6 +475,7 @@ def get_pretrained_model(network_name: str, weights: str) -> nn.Module:
Parameters:
network_name (str): Name of the network.
weights (str): Path to the pretrained weights file.
        clip_cpu (bool): Load the CLIP model on the CPU.

Raises:
RuntimeError: If the specified network is not supported.
@@ -488,7 +490,8 @@ def get_pretrained_model(network_name: str, weights: str) -> nn.Module:
# Load CLIP model
# TODO: support for None
clip_version = network_name.replace('clip_', '')
model, _ = clip.load(clip_version)
device = "cuda" if torch.cuda.is_available() and weights not in ['none', None] else "cpu"
model, _ = clip.load(clip_version, device=device)
elif 'taskonomy_' in network_name:
# Load Taskonomy model
model = taskonomy_network.TaskonomyEncoder()
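A standalone sketch of the new device-selection logic for CLIP (the wrapper function name is hypothetical; `clip.load` is the OpenAI CLIP API already used in the diff):

```python
import torch
import clip  # OpenAI CLIP package, as imported in the diff

def load_clip_backbone(network_name: str, weights) -> torch.nn.Module:
    # Hypothetical wrapper mirroring the changed branch: CLIP is only placed
    # on the GPU when real weights are requested and CUDA is available;
    # weights='none' keeps it on the CPU so that feature sizes can be probed
    # without a GPU.
    clip_version = network_name.replace('clip_', '')
    device = "cuda" if torch.cuda.is_available() and weights not in ['none', None] else "cpu"
    model, _preprocess = clip.load(clip_version, device=device)
    return model

# model = load_clip_backbone('clip_ViT-B/32', weights='none')  # stays on the CPU
```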
24 changes: 18 additions & 6 deletions osculari/models/readout.py
@@ -22,6 +22,8 @@
"load_paradigm_2afc",
"load_paradigm_ooo",
"ProbeNet",
"OddOneOutNet",
"Classifier2AFC",
"ActivationLoader",
"FeatureExtractor"
]
@@ -229,6 +231,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.extract_features(x)


def _image_encoder_none_weights(architecture: str, layer: str) -> nn.Module:
    # TODO: consider converting this into an overload function for generic size
model_instance = pretraineds.get_pretrained_model(architecture, 'none')
image_encoder = pretraineds.get_image_encoder(architecture, model_instance)
layer_features = pretraineds.model_features(image_encoder, architecture, layer)
return layer_features


class ProbeNet(ReadOutNet):
"""Adding a linear layer on top of readout features."""

@@ -262,22 +272,24 @@ def __init__(self, input_nodes: int, num_classes: int, img_size: int,
}

# Handle features from multiple layers
is_clip = 'clip' in self.architecture
if hasattr(self, 'act_dict'):
total_dim = 0
for layer in self.layers:
model_instance = pretraineds.get_pretrained_model(self.architecture, 'none')
image_encoder = pretraineds.get_image_encoder(self.architecture, model_instance)
layer_features = pretraineds.model_features(image_encoder, self.architecture, layer)
odim = model_utils.generic_features_size(layer_features, img_size, is_clip)
image_encoder = _image_encoder_none_weights(self.architecture, layer)
odim = model_utils.generic_features_size(image_encoder, img_size)
if type(odim) is int:
total_dim += odim
else:
tmp_size = 1 if len(odim) < 3 else np.prod(self.pool['size']) * self.pool['num']
total_dim += (odim[0] * tmp_size)
self.out_dim = (total_dim, 1)
else:
self.out_dim = model_utils.generic_features_size(self.backbone, img_size, is_clip)
image_encoder = self.backbone
            # To compute the output size of the CLIP model, we load a weight-free
            # instance on the CPU by passing weights='none'
if 'clip' in self.architecture:
image_encoder = _image_encoder_none_weights(self.architecture, self.layers)
self.out_dim = model_utils.generic_features_size(image_encoder, img_size)
if len(self.out_dim) == 1 and self.pool is not None:
RuntimeWarning(
'Layer %s output is a vector, no pooling can be applied' % self.layers
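A sketch of the pattern this refactor follows: feature sizes are probed with a weight-free ('none') copy of the requested layer, built on the CPU, so CLIP backbones never need a GPU at construction time. The architecture and layer names below are assumptions for illustration, and the private helper is imported only to show the internal pattern:

```python
import numpy as np

from osculari.models import model_utils  # import paths assumed
from osculari.models.readout import _image_encoder_none_weights

# Build a weight-free encoder for one layer and measure its output shape
# with a dummy image; ProbeNet accumulates such shapes into out_dim.
encoder = _image_encoder_none_weights('resnet50', 'layer3')
odim = model_utils.generic_features_size(encoder, img_size=224)
total_dim = odim[0] * int(np.prod(odim[1:])) if len(odim) > 1 else odim[0]
print(odim, total_dim)
```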