huggingface · yiyixuxu · May 10, 2024 · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/docs/README.md b/docs/README.md
@@ -242,10 +242,10 @@ Here's an example of a tuple return, comprising several objects:
 
 ```
     Returns:
-        `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
-        - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
+        `tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
+        - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.Tensor` of shape `(1,)` --
           Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
-        - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
+        - **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
           Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
 ```
 

diff --git a/docs/source/en/optimization/memory.md b/docs/source/en/optimization/memory.md
@@ -261,7 +261,7 @@ from dataclasses import dataclass
 
 @dataclass
 class UNet2DConditionOutput:
-    sample: torch.FloatTensor
+    sample: torch.Tensor
 
 
 pipe = StableDiffusionPipeline.from_pretrained(

diff --git a/docs/source/ko/optimization/fp16.md b/docs/source/ko/optimization/fp16.md
@@ -339,7 +339,7 @@ from dataclasses import dataclass
 
 @dataclass
 class UNet2DConditionOutput:
-    sample: torch.FloatTensor
+    sample: torch.Tensor
 
 
 pipe = StableDiffusionPipeline.from_pretrained(

diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py
@@ -44,9 +44,9 @@ def bits_to_decimal(x, bits=BITS):
 # modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale
 def ddim_bit_scheduler_step(
     self,
-    model_output: torch.FloatTensor,
+    model_output: torch.Tensor,
     timestep: int,
-    sample: torch.FloatTensor,
+    sample: torch.Tensor,
     eta: float = 0.0,
     use_clipped_model_output: bool = True,
     generator=None,
@@ -56,9 +56,9 @@ def ddim_bit_scheduler_step(
     Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
     process from the learned model outputs (most often the predicted noise).
     Args:
-        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+        model_output (`torch.Tensor`): direct output from learned diffusion model.
         timestep (`int`): current discrete timestep in the diffusion chain.
-        sample (`torch.FloatTensor`):
+        sample (`torch.Tensor`):
             current instance of sample being created by diffusion process.
         eta (`float`): weight of noise for added noise in diffusion step.
         use_clipped_model_output (`bool`): TODO
@@ -134,9 +134,9 @@ def ddim_bit_scheduler_step(
 
 def ddpm_bit_scheduler_step(
     self,
-    model_output: torch.FloatTensor,
+    model_output: torch.Tensor,
     timestep: int,
-    sample: torch.FloatTensor,
+    sample: torch.Tensor,
     prediction_type="epsilon",
     generator=None,
     return_dict: bool = True,
@@ -145,9 +145,9 @@ def ddpm_bit_scheduler_step(
     Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
     process from the learned model outputs (most often the predicted noise).
     Args:
-        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+        model_output (`torch.Tensor`): direct output from learned diffusion model.
         timestep (`int`): current discrete timestep in the diffusion chain.
-        sample (`torch.FloatTensor`):
+        sample (`torch.Tensor`):
             current instance of sample being created by diffusion process.
         prediction_type (`str`, default `epsilon`):
             indicates whether the model predicts the noise (epsilon), or the samples (`sample`).

diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py
@@ -233,8 +233,8 @@ def cond_fn(
     @torch.no_grad()
     def __call__(
         self,
-        style_image: Union[torch.FloatTensor, PIL.Image.Image],
-        content_image: Union[torch.FloatTensor, PIL.Image.Image],
+        style_image: Union[torch.Tensor, PIL.Image.Image],
+        content_image: Union[torch.Tensor, PIL.Image.Image],
         style_prompt: Optional[str] = None,
         content_prompt: Optional[str] = None,
         height: Optional[int] = 512,

diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py
@@ -180,7 +180,7 @@ def __call__(
         num_cutouts: Optional[int] = 4,
         use_cutouts: Optional[bool] = True,
         generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ):

diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py
@@ -306,7 +306,7 @@ def __call__(
         prompt: Union[str, List[str]],
         height: Optional[int] = 512,
         width: Optional[int] = 512,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
@@ -317,7 +317,7 @@ def __call__(
         num_cutouts: Optional[int] = 4,
         use_cutouts: Optional[bool] = True,
         generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ):

diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py
@@ -354,10 +354,10 @@ def __call__(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         weights: Optional[str] = "",
     ):
@@ -391,7 +391,7 @@ def __call__(
             generator (`torch.Generator`, *optional*):
                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                 deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -403,7 +403,7 @@ def __call__(
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.

diff --git a/examples/community/ddim_noise_comparative_analysis.py b/examples/community/ddim_noise_comparative_analysis.py
@@ -103,7 +103,7 @@ def prepare_latents(self, image, timestep, batch_size, dtype, device, generator=
     @torch.no_grad()
     def __call__(
         self,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
         strength: float = 0.8,
         batch_size: int = 1,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
@@ -115,7 +115,7 @@ def __call__(
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
         Args:
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.Tensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             strength (`float`, *optional*, defaults to 0.8):

diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py
@@ -205,7 +205,7 @@ def __init__(
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPImageProcessor,
         language_adapter: TranslatorNoLN = None,
-        tensor_norm: torch.FloatTensor = None,
+        tensor_norm: torch.Tensor = None,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
@@ -231,7 +231,7 @@ def load_language_adapter(
         num_token: int,
         dim: int,
         dim_out: int,
-        tensor_norm: torch.FloatTensor,
+        tensor_norm: torch.Tensor,
         mult: int = 2,
         depth: int = 5,
     ):
@@ -242,7 +242,7 @@ def load_language_adapter(
         )
         self.language_adapter.load_state_dict(torch.load(model_path))
 
-    def _adapt_language(self, prompt_embeds: torch.FloatTensor):
+    def _adapt_language(self, prompt_embeds: torch.Tensor):
         prompt_embeds = prompt_embeds / 3
         prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2)
         return prompt_embeds
@@ -254,8 +254,8 @@ def encode_prompt(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -275,10 +275,10 @@ def encode_prompt(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -535,7 +535,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32
                 data type of the generated embeddings
 
         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
         """
         assert len(w.shape) == 1
         w = w * 1000.0
@@ -594,9 +594,9 @@ def __call__(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -635,14 +635,14 @@ def __call__(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.

diff --git a/examples/community/hd_painter.py b/examples/community/hd_painter.py
@@ -28,10 +28,10 @@ def __init__(self, mask, token_idx, scale_factor):
     def __call__(
         self,
         attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
         scale: float = 1.0,
     ) -> torch.Tensor:
         # Same as the default AttnProcessor up untill the part where similarity matrix gets saved
@@ -111,10 +111,10 @@ def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidan
     def __call__(
         self,
         attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
         scale: float = 1.0,
     ) -> torch.Tensor:
         # Automatically recognize the resolution of the current attention layer and resize the masks accordingly
@@ -454,7 +454,7 @@ def __call__(
         prompt: Union[str, List[str]] = None,
         image: PipelineImageInput = None,
         mask_image: PipelineImageInput = None,
-        masked_image_latents: torch.FloatTensor = None,
+        masked_image_latents: torch.Tensor = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         padding_mask_crop: Optional[int] = None,
@@ -467,9 +467,9 @@ def __call__(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.01,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,

diff --git a/examples/community/iadb.py b/examples/community/iadb.py
@@ -17,21 +17,21 @@ class IADBScheduler(SchedulerMixin, ConfigMixin):
 
     def step(
         self,
-        model_output: torch.FloatTensor,
+        model_output: torch.Tensor,
         timestep: int,
-        x_alpha: torch.FloatTensor,
-    ) -> torch.FloatTensor:
+        x_alpha: torch.Tensor,
+    ) -> torch.Tensor:
         """
         Predict the sample at the previous timestep by reversing the ODE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
 
         Args:
-            model_output (`torch.FloatTensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
+            model_output (`torch.Tensor`): direct output from learned diffusion model. It is the direction from x0 to x1.
             timestep (`float`): current timestep in the diffusion chain.
-            x_alpha (`torch.FloatTensor`): x_alpha sample for the current timestep
+            x_alpha (`torch.Tensor`): x_alpha sample for the current timestep
 
         Returns:
-            `torch.FloatTensor`: the sample at the previous timestep
+            `torch.Tensor`: the sample at the previous timestep
 
         """
         if self.num_inference_steps is None:
@@ -53,10 +53,10 @@ def set_timesteps(self, num_inference_steps: int):
 
     def add_noise(
         self,
-        original_samples: torch.FloatTensor,
-        noise: torch.FloatTensor,
-        alpha: torch.FloatTensor,
-    ) -> torch.FloatTensor:
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        alpha: torch.Tensor,
+    ) -> torch.Tensor:
         return original_samples * alpha + noise * (1 - alpha)
 
     def __len__(self):

diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
@@ -110,7 +110,7 @@ def __init__(
     def train(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.Tensor, PIL.Image.Image],
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         generator: Optional[torch.Generator] = None,
@@ -144,7 +144,7 @@ def train(
             generator (`torch.Generator`, *optional*):
                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                 deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.