From 94d7327735a2921b4da37939a5d61cd6fb9b525a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 5 Feb 2024 11:13:20 -0800
Subject: [PATCH 01/14] Bump the github-actions group with 3 updates (#2330)

Bumps the github-actions group with 3 updates: [actions/cache](https://github.com/actions/cache), [actions/upload-artifact](https://github.com/actions/upload-artifact) and [github/codeql-action](https://github.com/github/codeql-action).


Updates `actions/cache` from 3 to 4
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v3...v4)

Updates `actions/upload-artifact` from 4.0.0 to 4.3.0
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/c7d193f32edcb7bfad88892161225aeda64e9392...26f96dfa697d77e81fd5907df203aa23a56210a8)

Updates `github/codeql-action` from 3.22.12 to 3.23.2
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/012739e5082ff0c22ca6d6ab32e07c36df03c4a4...b7bf0a3ed3ecfa44160715d7c442788f65f0f923)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: github-actions
- dependency-name: github/codeql-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: github-actions
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/actions.yml   | 6 +++---
 .github/workflows/nightly.yml   | 2 +-
 .github/workflows/release.yml   | 2 +-
 .github/workflows/scorecard.yml | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml
index a80bc1b735..8be69b967b 100644
--- a/.github/workflows/actions.yml
+++ b/.github/workflows/actions.yml
@@ -26,7 +26,7 @@ jobs:
         python -m pip install --upgrade pip setuptools
         echo "::set-output name=dir::$(pip cache dir)"
     - name: pip cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ${{ steps.pip-cache.outputs.dir }}
         key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
@@ -65,7 +65,7 @@ jobs:
         python -m pip install --upgrade pip setuptools
         echo "::set-output name=dir::$(pip cache dir)"
     - name: pip cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ${{ steps.pip-cache.outputs.dir }}
         key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
@@ -110,7 +110,7 @@ jobs:
         python -m pip install --upgrade pip setuptools
         echo "::set-output name=dir::$(pip cache dir)"
     - name: pip cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ${{ steps.pip-cache.outputs.dir }}
         key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 4ec23461b3..ded0a461b2 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -27,7 +27,7 @@ jobs:
           python -m pip install --upgrade pip setuptools
           echo "::set-output name=dir::$(pip cache dir)"
       - name: pip cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ${{ steps.pip-cache.outputs.dir }}
           key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index afe38eb519..7a471e938a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -25,7 +25,7 @@ jobs:
           python -m pip install --upgrade pip setuptools
           echo "::set-output name=dir::$(pip cache dir)"
       - name: pip cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ${{ steps.pip-cache.outputs.dir }}
           key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index ff310c9dee..98509aef93 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -45,7 +45,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@c7d193f32edcb7bfad88892161225aeda64e9392 # v4.0.0
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
         with:
           name: SARIF file
           path: results.sarif
@@ -53,6 +53,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@012739e5082ff0c22ca6d6ab32e07c36df03c4a4 # v3.22.12
+        uses: github/codeql-action/upload-sarif@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2
         with:
           sarif_file: results.sarif

From 160d2a9506c2f4cb9a841fd3020af9f16a533539 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Wed, 7 Feb 2024 23:26:44 +0000
Subject: [PATCH 02/14] update text model

---
 .../feature_extractor/clip/clip_text_model.py       | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index 3665e0b741..345d6a3c2c 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -44,13 +44,24 @@ def __init__(
 
     def call(self, inputs):
         token_embedding = self.token_embedding(inputs)
+        input_shape = token_embedding.shape
+        position_ids = ops.expand_dims(
+            ops.arange(start=0, stop=input_shape[-1]), axis=0
+        )
+        position_embeds = ops.take(
+            self.positional_embedding, indices=position_ids
+        )
+        position_embeds = ops.tile(
+            position_embeds, repeats=(input_shape[0], 1, 1)
+        )
         encoded_output = self.encoder(
-            token_embedding + self.positional_embedding
+            token_embedding + position_embeds
         )
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(
             ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1
         )
+        print("incides", indices)
         selected_features = ops.take_along_axis(
             layer_norm, indices[:, :, None], axis=1
         )

From 681120c513931483e7bae48be4da49a920812d5b Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 02:27:38 +0000
Subject: [PATCH 03/14] update text encoder

---
 .../feature_extractor/clip/clip_model.py      |  6 -----
 .../feature_extractor/clip/clip_text_model.py | 25 +++++--------------
 2 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py
index bf0740c4e9..f56a7609e4 100644
--- a/keras_cv/models/feature_extractor/clip/clip_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_model.py
@@ -146,12 +146,6 @@ def __init__(
         self.image_embeddings = None
         self.text_embeddings = None
 
-    def build_attention_mask(self):
-        mask = ops.ones((self.context_length, self.context_length))
-        # Zero out the lower diagonal
-        mask = ops.triu(mask)
-        return ops.cast(mask, "float32")
-
     def encode_images(self, image):
         return self.image_encoder(image)
 
diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index 345d6a3c2c..efe0ac49cb 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -29,11 +29,15 @@ def __init__(
             shape=[self.context_length, transformer_width],
             name="positional_embedding",
         )
+        mask = ops.ones((self.context_length, self.context_length))
+        # Zero out the lower diagonal
+        mask = ops.triu(mask)
+        mask = ops.cast(mask, "float32")
         self.encoder = CLIPEncoder(
             width=transformer_width,
             layers=transformer_layers,
             heads=transformer_heads,
-            attn_mask=self.build_attention_mask(),
+            attn_mask=mask,
             name="clip_encoder",
         )
         self.ln_final = keras.layers.LayerNormalization(name="ln_final")
@@ -44,33 +48,16 @@ def __init__(
 
     def call(self, inputs):
         token_embedding = self.token_embedding(inputs)
-        input_shape = token_embedding.shape
-        position_ids = ops.expand_dims(
-            ops.arange(start=0, stop=input_shape[-1]), axis=0
-        )
-        position_embeds = ops.take(
-            self.positional_embedding, indices=position_ids
-        )
-        position_embeds = ops.tile(
-            position_embeds, repeats=(input_shape[0], 1, 1)
-        )
         encoded_output = self.encoder(
-            token_embedding + position_embeds
+            token_embedding + self.positional_embedding
         )
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(
             ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1
         )
-        print("incides", indices)
         selected_features = ops.take_along_axis(
             layer_norm, indices[:, :, None], axis=1
         )
         text_features = self.text_projector(selected_features)
         output = ops.squeeze(text_features, axis=1)
         return output
-
-    def build_attention_mask(self):
-        mask = ops.ones((self.context_length, self.context_length))
-        # Zero out the lower diagonal
-        mask = ops.triu(mask)
-        return ops.cast(mask, "float32")

From df73f2343261d844a0bf3a2c117e0ab03ec25f26 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 06:12:45 +0000
Subject: [PATCH 04/14] update position embeddings

---
 .../models/feature_extractor/clip/clip_text_model.py     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index efe0ac49cb..b95dd518d4 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -25,8 +25,9 @@ def __init__(
         )
 
         self.vocab_size = vocab_size
-        self.positional_embedding = self.add_weight(
-            shape=[self.context_length, transformer_width],
+        self.positional_embedding = keras.layers.Embedding(
+            self.context_length,
+            transformer_width,
             name="positional_embedding",
         )
         mask = ops.ones((self.context_length, self.context_length))
@@ -48,8 +49,10 @@ def __init__(
 
     def call(self, inputs):
         token_embedding = self.token_embedding(inputs)
+        position_ids = ops.expand_dims(ops.arange(self.context_length, dtype="int32"), 0)
+        position_embedding = self.positional_embedding(position_ids)
         encoded_output = self.encoder(
-            token_embedding + self.positional_embedding
+            token_embedding + position_embedding
         )
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(

From 80bde9c641c4a44f7f8e331e555dc3dda16ae063 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 06:35:22 +0000
Subject: [PATCH 05/14] update positional embeddings

---
 .../models/feature_extractor/clip/clip_text_model.py     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index b95dd518d4..41a0f57ca7 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -49,11 +49,14 @@ def __init__(
 
     def call(self, inputs):
         token_embedding = self.token_embedding(inputs)
-        position_ids = ops.expand_dims(ops.arange(self.context_length, dtype="int32"), 0)
+        position_ids = ops.expand_dims(
+            ops.arange(self.context_length, dtype="int32"), 0
+        )
         position_embedding = self.positional_embedding(position_ids)
-        encoded_output = self.encoder(
-            token_embedding + position_embedding
+        position_embedding = ops.tile(
+            position_embedding, repeats=(inputs.shape[0], 1, 1)
         )
+        encoded_output = self.encoder(token_embedding + position_embedding)
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(
             ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1

From 5f7b23bfcc28764c342be07e747a794c599d2dbd Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 17:37:28 +0000
Subject: [PATCH 06/14] add attention masks

---
 .../feature_extractor/clip/clip_encoder.py    | 38 ++++++++++++-------
 .../feature_extractor/clip/clip_model.py      | 10 +++--
 .../feature_extractor/clip/clip_processor.py  |  3 +-
 .../feature_extractor/clip/clip_text_model.py | 11 +++++-
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py
index 653189ca7d..1be8ff1991 100644
--- a/keras_cv/models/feature_extractor/clip/clip_encoder.py
+++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py
@@ -58,14 +58,23 @@ def __init__(
             * 0.02
         )
 
-    def attention(self, x):
+    def attention(self, x, attention_mask=None):
         self.attn_mask = (
             ops.cast(self.attn_mask, dtype=x.dtype)
             if self.attn_mask is not None
             else None
         )
+        attention_mask = (
+            ops.cast(attention_mask, dtype=x.dtype)
+            if attention_mask is not None
+            else None
+        )
 
-        return self.attn(x, attention_mask=self.attn_mask)
+        return self.attn(
+            x,
+            attention_mask=attention_mask,
+            causal_attention_mask=self.attn_mask,
+        )
 
     def build(self, input_shape):
         super().build(input_shape)
@@ -93,8 +102,8 @@ def build(self, input_shape):
         )
         self.ln_2 = keras.layers.LayerNormalization(epsilon=1e-5, name="ln_2")
 
-    def call(self, x):
-        x = x + self.attention(self.ln_1(x))
+    def call(self, x, attention_mask=None):
+        x = x + self.attention(self.ln_1(x), attention_mask=attention_mask)
         x = x + self.mlp(self.ln_2(x))
         return x
 
@@ -109,20 +118,21 @@ def __init__(self, width, layers, heads, attn_mask=None, **kwargs):
         self.layers = layers
         self.heads = heads
         self.attn_mask = attn_mask
-        self.resblocks = keras.Sequential(
-            [
-                ResidualAttention(
-                    self.width, self.heads, self.layers, self.attn_mask
-                )
-                for _ in range(self.layers)
-            ]
-        )
+        self.resblocks = [
+            ResidualAttention(
+                self.width, self.heads, self.layers, self.attn_mask
+            )
+            for _ in range(self.layers)
+        ]
 
     def build(self, input_shape):
         super().build(input_shape)
+        self.resblocks.build()
 
-    def call(self, x):
-        return self.resblocks(x)
+    def call(self, x, attention_mask=None):
+        for block in self.resblocks:
+            x = block(x, attention_mask=attention_mask)
+        return x
 
     def compute_output_shape(self, inputs_shape):
         return inputs_shape
diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py
index f56a7609e4..04dd816f07 100644
--- a/keras_cv/models/feature_extractor/clip/clip_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_model.py
@@ -149,12 +149,14 @@ def __init__(
     def encode_images(self, image):
         return self.image_encoder(image)
 
-    def encode_text(self, text):
-        return self.text_encoder(text)
+    def encode_text(self, text, attention_mask=None):
+        return self.text_encoder(text, attention_mask=attention_mask)
 
-    def call(self, image, text):
+    def call(self, image, text, attention_mask=None):
         self.image_embeddings = self.encode_images(image)
-        self.text_embeddings = self.encode_text(text)
+        self.text_embeddings = self.encode_text(
+            text, attention_mask=attention_mask
+        )
         normalize_image_features = keras.ops.sqrt(
             keras.ops.sum(
                 keras.ops.power(self.image_embeddings, 2), keepdims=True
diff --git a/keras_cv/models/feature_extractor/clip/clip_processor.py b/keras_cv/models/feature_extractor/clip/clip_processor.py
index 5505e87f11..80183fcb0e 100644
--- a/keras_cv/models/feature_extractor/clip/clip_processor.py
+++ b/keras_cv/models/feature_extractor/clip/clip_processor.py
@@ -109,12 +109,11 @@ def process_texts(self, texts, context_length: int = 77):
             texts = [texts]
 
         def pack_tokens(text):
-            tok, _ = self.packer(
+            return self.packer(
                 self.tokenizer(text),
                 sequence_length=context_length,
                 add_start_value=True,
                 add_end_value=True,
             )
-            return tok
 
         return pack_tokens(texts)
diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index 41a0f57ca7..bb096c805d 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -47,7 +47,7 @@ def __init__(
             embed_dim, name="text_projector", use_bias=False
         )
 
-    def call(self, inputs):
+    def call(self, inputs, attention_mask=None):
         token_embedding = self.token_embedding(inputs)
         position_ids = ops.expand_dims(
             ops.arange(self.context_length, dtype="int32"), 0
@@ -56,7 +56,14 @@ def call(self, inputs):
         position_embedding = ops.tile(
             position_embedding, repeats=(inputs.shape[0], 1, 1)
         )
-        encoded_output = self.encoder(token_embedding + position_embedding)
+        attention_mask = ops.cast(attention_mask, dtype="float32")
+        expanded_mask = ops.tile(
+            attention_mask[:, None, None, :], (1, 1, self.context_length, 1)
+        )
+        expanded_mask = (1.0 - expanded_mask) * (-1e8)
+        encoded_output = self.encoder(
+            token_embedding + position_embedding, attention_mask=expanded_mask
+        )
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(
             ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1

From 7530eed43c765e35fbe5e7529a1a5ea460956156 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 19:15:05 +0000
Subject: [PATCH 07/14] update expanded mask

---
 keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index bb096c805d..eb2287ee80 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -60,7 +60,7 @@ def call(self, inputs, attention_mask=None):
         expanded_mask = ops.tile(
             attention_mask[:, None, None, :], (1, 1, self.context_length, 1)
         )
-        expanded_mask = (1.0 - expanded_mask) * (-1e8)
+        # expanded_mask = (1.0 - expanded_mask) * (-1e8)
         encoded_output = self.encoder(
             token_embedding + position_embedding, attention_mask=expanded_mask
         )

From 0211bd47f8267c68a685f785eaeebf56405524f4 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 19:24:46 +0000
Subject: [PATCH 08/14] Revert "update expanded mask"

---
 keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index eb2287ee80..bb096c805d 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -60,7 +60,7 @@ def call(self, inputs, attention_mask=None):
         expanded_mask = ops.tile(
             attention_mask[:, None, None, :], (1, 1, self.context_length, 1)
         )
-        # expanded_mask = (1.0 - expanded_mask) * (-1e8)
+        expanded_mask = (1.0 - expanded_mask) * (-1e8)
         encoded_output = self.encoder(
             token_embedding + position_embedding, attention_mask=expanded_mask
         )

From d488b7523af0fea4ac94cade1f84b763548b2a7e Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 19:33:48 +0000
Subject: [PATCH 09/14] change causal masks

---
 keras_cv/models/feature_extractor/clip/clip_text_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index bb096c805d..d98e13160b 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -34,6 +34,7 @@ def __init__(
         # Zero out the lower diagonal
         mask = ops.triu(mask)
         mask = ops.cast(mask, "float32")
+        mask = (1.0 - mask) * (-1e8)
         self.encoder = CLIPEncoder(
             width=transformer_width,
             layers=transformer_layers,

From d9d126430d5e1ae3ae18291ce60f75c2b7d2532a Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 19:42:00 +0000
Subject: [PATCH 10/14] Revert "change causal masks"

---
 keras_cv/models/feature_extractor/clip/clip_text_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index d98e13160b..bb096c805d 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -34,7 +34,6 @@ def __init__(
         # Zero out the lower diagonal
         mask = ops.triu(mask)
         mask = ops.cast(mask, "float32")
-        mask = (1.0 - mask) * (-1e8)
         self.encoder = CLIPEncoder(
             width=transformer_width,
             layers=transformer_layers,

From 64d66b54d172e16235e07993779086155921059a Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 23:27:34 +0000
Subject: [PATCH 11/14] update attention masks

---
 keras_cv/models/feature_extractor/clip/clip_encoder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py
index 1be8ff1991..d1146985c3 100644
--- a/keras_cv/models/feature_extractor/clip/clip_encoder.py
+++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py
@@ -69,11 +69,11 @@ def attention(self, x, attention_mask=None):
             if attention_mask is not None
             else None
         )
+        mask = ops.add(self.attn_mask, attention_mask)
 
         return self.attn(
             x,
-            attention_mask=attention_mask,
-            causal_attention_mask=self.attn_mask,
+            attention_mask=mask,
         )
 
     def build(self, input_shape):

From de0be1907fb66ac68e96b97548ef799bd34afebe Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Thu, 8 Feb 2024 23:51:02 +0000
Subject: [PATCH 12/14] update clip encoder

---
 .../models/feature_extractor/clip/clip_encoder.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py
index d1146985c3..8be0cd05fa 100644
--- a/keras_cv/models/feature_extractor/clip/clip_encoder.py
+++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py
@@ -59,17 +59,18 @@ def __init__(
         )
 
     def attention(self, x, attention_mask=None):
-        self.attn_mask = (
+        mask = (
             ops.cast(self.attn_mask, dtype=x.dtype)
             if self.attn_mask is not None
             else None
         )
-        attention_mask = (
-            ops.cast(attention_mask, dtype=x.dtype)
-            if attention_mask is not None
-            else None
-        )
-        mask = ops.add(self.attn_mask, attention_mask)
+        if attention_mask is not None:
+            attention_mask = (
+                ops.cast(attention_mask, dtype=x.dtype)
+                if attention_mask is not None
+                else None
+            )
+            mask = ops.add(self.attn_mask, attention_mask)
 
         return self.attn(
             x,

From 4b8c1efd8a9f5b8e03d38dd297797449f81ec82e Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Fri, 9 Feb 2024 00:48:53 +0000
Subject: [PATCH 13/14] add print statements

---
 keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index bb096c805d..715c3a8dd3 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -64,6 +64,7 @@ def call(self, inputs, attention_mask=None):
         encoded_output = self.encoder(
             token_embedding + position_embedding, attention_mask=expanded_mask
         )
+        print("encoded_output", encoded_output)
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(
             ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1
@@ -71,6 +72,7 @@ def call(self, inputs, attention_mask=None):
         selected_features = ops.take_along_axis(
             layer_norm, indices[:, :, None], axis=1
         )
+        print("pooler output", selected_features)
         text_features = self.text_projector(selected_features)
         output = ops.squeeze(text_features, axis=1)
         return output

From 54f02e81b0bec5035b004673c5a4c9371286a339 Mon Sep 17 00:00:00 2001
From: Divyashree Sreepathihalli <divyashreepathihalli>
Date: Fri, 9 Feb 2024 01:31:54 +0000
Subject: [PATCH 14/14] update the pooler output

---
 keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py
index 715c3a8dd3..fc765fc67b 100644
--- a/keras_cv/models/feature_extractor/clip/clip_text_model.py
+++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py
@@ -67,7 +67,7 @@ def call(self, inputs, attention_mask=None):
         print("encoded_output", encoded_output)
         layer_norm = self.ln_final(encoded_output)
         indices = ops.expand_dims(
-            ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1
+            ops.cast(ops.argmax(inputs, axis=-1), "int32"), axis=-1
         )
         selected_features = ops.take_along_axis(
             layer_norm, indices[:, :, None], axis=1