From 94d7327735a2921b4da37939a5d61cd6fb9b525a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:13:20 -0800 Subject: [PATCH 01/14] Bump the github-actions group with 3 updates (#2330) Bumps the github-actions group with 3 updates: [actions/cache](https://github.com/actions/cache), [actions/upload-artifact](https://github.com/actions/upload-artifact) and [github/codeql-action](https://github.com/github/codeql-action). Updates `actions/cache` from 3 to 4 - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v3...v4) Updates `actions/upload-artifact` from 4.0.0 to 4.3.0 - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/c7d193f32edcb7bfad88892161225aeda64e9392...26f96dfa697d77e81fd5907df203aa23a56210a8) Updates `github/codeql-action` from 3.22.12 to 3.23.2 - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/012739e5082ff0c22ca6d6ab32e07c36df03c4a4...b7bf0a3ed3ecfa44160715d7c442788f65f0f923) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/actions.yml | 6 +++--- .github/workflows/nightly.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/scorecard.yml | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index a80bc1b735..8be69b967b 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -26,7 +26,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} @@ -65,7 +65,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} @@ -110,7 +110,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 4ec23461b3..ded0a461b2 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -27,7 +27,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index afe38eb519..7a471e938a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,7 +25,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index ff310c9dee..98509aef93 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -45,7 +45,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@c7d193f32edcb7bfad88892161225aeda64e9392 # v4.0.0 + uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 with: name: SARIF file path: results.sarif @@ -53,6 +53,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@012739e5082ff0c22ca6d6ab32e07c36df03c4a4 # v3.22.12 + uses: github/codeql-action/upload-sarif@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 with: sarif_file: results.sarif From 160d2a9506c2f4cb9a841fd3020af9f16a533539 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Wed, 7 Feb 2024 23:26:44 +0000 Subject: [PATCH 02/14] update text model --- .../feature_extractor/clip/clip_text_model.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index 3665e0b741..345d6a3c2c 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -44,13 +44,24 @@ def __init__( def call(self, inputs): token_embedding = self.token_embedding(inputs) + input_shape = token_embedding.shape + position_ids = ops.expand_dims( + ops.arange(start=0, stop=input_shape[-1]), axis=0 + ) + position_embeds = ops.take( + self.positional_embedding, indices=position_ids + ) + position_embeds = ops.tile( + position_embeds, repeats=(input_shape[0], 1, 1) + ) encoded_output = self.encoder( - token_embedding + self.positional_embedding + token_embedding + position_embeds ) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1 ) + print("incides", indices) selected_features = ops.take_along_axis( layer_norm, indices[:, :, None], axis=1 ) From 681120c513931483e7bae48be4da49a920812d5b Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 02:27:38 +0000 Subject: [PATCH 03/14] update text encoder --- .../feature_extractor/clip/clip_model.py | 6 ----- .../feature_extractor/clip/clip_text_model.py | 25 +++++-------------- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py index bf0740c4e9..f56a7609e4 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_model.py @@ -146,12 +146,6 @@ def __init__( self.image_embeddings = None self.text_embeddings = None - def build_attention_mask(self): - mask = ops.ones((self.context_length, self.context_length)) - # Zero out the lower diagonal - mask = ops.triu(mask) - return ops.cast(mask, "float32") - def encode_images(self, image): return self.image_encoder(image) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index 345d6a3c2c..efe0ac49cb 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -29,11 +29,15 @@ def __init__( shape=[self.context_length, transformer_width], name="positional_embedding", ) + mask = ops.ones((self.context_length, self.context_length)) + # Zero out the lower diagonal + mask = ops.triu(mask) + mask = ops.cast(mask, "float32") self.encoder = CLIPEncoder( width=transformer_width, layers=transformer_layers, heads=transformer_heads, - attn_mask=self.build_attention_mask(), + attn_mask=mask, name="clip_encoder", ) self.ln_final = keras.layers.LayerNormalization(name="ln_final") @@ -44,33 +48,16 @@ def __init__( def call(self, inputs): token_embedding = self.token_embedding(inputs) - input_shape = token_embedding.shape - position_ids = ops.expand_dims( - ops.arange(start=0, stop=input_shape[-1]), axis=0 - ) - position_embeds = ops.take( - self.positional_embedding, indices=position_ids - ) - position_embeds = ops.tile( - position_embeds, repeats=(input_shape[0], 1, 1) - ) encoded_output = self.encoder( - token_embedding + position_embeds + token_embedding + self.positional_embedding ) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1 ) - print("incides", indices) selected_features = ops.take_along_axis( layer_norm, indices[:, :, None], axis=1 ) text_features = self.text_projector(selected_features) output = ops.squeeze(text_features, axis=1) return output - - def build_attention_mask(self): - mask = ops.ones((self.context_length, self.context_length)) - # Zero out the lower diagonal - mask = ops.triu(mask) - return ops.cast(mask, "float32") From df73f2343261d844a0bf3a2c117e0ab03ec25f26 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 06:12:45 +0000 Subject: [PATCH 04/14] update position embeddings --- .../models/feature_extractor/clip/clip_text_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index efe0ac49cb..b95dd518d4 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -25,8 +25,9 @@ def __init__( ) self.vocab_size = vocab_size - self.positional_embedding = self.add_weight( - shape=[self.context_length, transformer_width], + self.positional_embedding = keras.layers.Embedding( + self.context_length, + transformer_width, name="positional_embedding", ) mask = ops.ones((self.context_length, self.context_length)) @@ -48,8 +49,10 @@ def __init__( def call(self, inputs): token_embedding = self.token_embedding(inputs) + position_ids = ops.expand_dims(ops.arange(self.context_length, dtype="int32"), 0) + position_embedding = self.positional_embedding(position_ids) encoded_output = self.encoder( - token_embedding + self.positional_embedding + token_embedding + position_embedding ) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( From 80bde9c641c4a44f7f8e331e555dc3dda16ae063 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 06:35:22 +0000 Subject: [PATCH 05/14] update positonal embeddings --- .../models/feature_extractor/clip/clip_text_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index b95dd518d4..41a0f57ca7 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -49,11 +49,14 @@ def __init__( def call(self, inputs): token_embedding = self.token_embedding(inputs) - position_ids = ops.expand_dims(ops.arange(self.context_length, dtype="int32"), 0) + position_ids = ops.expand_dims( + ops.arange(self.context_length, dtype="int32"), 0 + ) position_embedding = self.positional_embedding(position_ids) - encoded_output = self.encoder( - token_embedding + position_embedding + position_embedding = ops.tile( + position_embedding, repeats=(inputs.shape[0], 1, 1) ) + encoded_output = self.encoder(token_embedding + position_embedding) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1 From 5f7b23bfcc28764c342be07e747a794c599d2dbd Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 17:37:28 +0000 Subject: [PATCH 06/14] add attention masks --- .../feature_extractor/clip/clip_encoder.py | 38 ++++++++++++------- .../feature_extractor/clip/clip_model.py | 10 +++-- .../feature_extractor/clip/clip_processor.py | 3 +- .../feature_extractor/clip/clip_text_model.py | 11 +++++- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py index 653189ca7d..1be8ff1991 100644 --- a/keras_cv/models/feature_extractor/clip/clip_encoder.py +++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py @@ -58,14 +58,23 @@ def __init__( * 0.02 ) - def attention(self, x): + def attention(self, x, attention_mask=None): self.attn_mask = ( ops.cast(self.attn_mask, dtype=x.dtype) if self.attn_mask is not None else None ) + attention_mask = ( + ops.cast(attention_mask, dtype=x.dtype) + if attention_mask is not None + else None + ) - return self.attn(x, attention_mask=self.attn_mask) + return self.attn( + x, + attention_mask=attention_mask, + causal_attention_mask=self.attn_mask, + ) def build(self, input_shape): super().build(input_shape) @@ -93,8 +102,8 @@ def build(self, input_shape): ) self.ln_2 = keras.layers.LayerNormalization(epsilon=1e-5, name="ln_2") - def call(self, x): - x = x + self.attention(self.ln_1(x)) + def call(self, x, attention_mask=None): + x = x + self.attention(self.ln_1(x), attention_mask=attention_mask) x = x + self.mlp(self.ln_2(x)) return x @@ -109,20 +118,21 @@ def __init__(self, width, layers, heads, attn_mask=None, **kwargs): self.layers = layers self.heads = heads self.attn_mask = attn_mask - self.resblocks = keras.Sequential( - [ - ResidualAttention( - self.width, self.heads, self.layers, self.attn_mask - ) - for _ in range(self.layers) - ] - ) + self.resblocks = [ + ResidualAttention( + self.width, self.heads, self.layers, self.attn_mask + ) + for _ in range(self.layers) + ] def build(self, input_shape): super().build(input_shape) + self.resblocks.build() - def call(self, x): - return self.resblocks(x) + def call(self, x, attention_mask=None): + for block in self.resblocks: + x = block(x, attention_mask=attention_mask) + return x def compute_output_shape(self, inputs_shape): return inputs_shape diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py index f56a7609e4..04dd816f07 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_model.py @@ -149,12 +149,14 @@ def __init__( def encode_images(self, image): return self.image_encoder(image) - def encode_text(self, text): - return self.text_encoder(text) + def encode_text(self, text, attention_mask=None): + return self.text_encoder(text, attention_mask=attention_mask) - def call(self, image, text): + def call(self, image, text, attention_mask=None): self.image_embeddings = self.encode_images(image) - self.text_embeddings = self.encode_text(text) + self.text_embeddings = self.encode_text( + text, attention_mask=attention_mask + ) normalize_image_features = keras.ops.sqrt( keras.ops.sum( keras.ops.power(self.image_embeddings, 2), keepdims=True diff --git a/keras_cv/models/feature_extractor/clip/clip_processor.py b/keras_cv/models/feature_extractor/clip/clip_processor.py index 5505e87f11..80183fcb0e 100644 --- a/keras_cv/models/feature_extractor/clip/clip_processor.py +++ b/keras_cv/models/feature_extractor/clip/clip_processor.py @@ -109,12 +109,11 @@ def process_texts(self, texts, context_length: int = 77): texts = [texts] def pack_tokens(text): - tok, _ = self.packer( + return self.packer( self.tokenizer(text), sequence_length=context_length, add_start_value=True, add_end_value=True, ) - return tok return pack_tokens(texts) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index 41a0f57ca7..bb096c805d 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -47,7 +47,7 @@ def __init__( embed_dim, name="text_projector", use_bias=False ) - def call(self, inputs): + def call(self, inputs, attention_mask=None): token_embedding = self.token_embedding(inputs) position_ids = ops.expand_dims( ops.arange(self.context_length, dtype="int32"), 0 @@ -56,7 +56,14 @@ def call(self, inputs): position_embedding = ops.tile( position_embedding, repeats=(inputs.shape[0], 1, 1) ) - encoded_output = self.encoder(token_embedding + position_embedding) + attention_mask = ops.cast(attention_mask, dtype="float32") + expanded_mask = ops.tile( + attention_mask[:, None, None, :], (1, 1, self.context_length, 1) + ) + expanded_mask = (1.0 - expanded_mask) * (-1e8) + encoded_output = self.encoder( + token_embedding + position_embedding, attention_mask=expanded_mask + ) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1 From 7530eed43c765e35fbe5e7529a1a5ea460956156 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 19:15:05 +0000 Subject: [PATCH 07/14] update expanded mask --- keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index bb096c805d..eb2287ee80 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -60,7 +60,7 @@ def call(self, inputs, attention_mask=None): expanded_mask = ops.tile( attention_mask[:, None, None, :], (1, 1, self.context_length, 1) ) - expanded_mask = (1.0 - expanded_mask) * (-1e8) + # expanded_mask = (1.0 - expanded_mask) * (-1e8) encoded_output = self.encoder( token_embedding + position_embedding, attention_mask=expanded_mask ) From 0211bd47f8267c68a685f785eaeebf56405524f4 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 19:24:46 +0000 Subject: [PATCH 08/14] revert previous commit --- keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index eb2287ee80..bb096c805d 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -60,7 +60,7 @@ def call(self, inputs, attention_mask=None): expanded_mask = ops.tile( attention_mask[:, None, None, :], (1, 1, self.context_length, 1) ) - # expanded_mask = (1.0 - expanded_mask) * (-1e8) + expanded_mask = (1.0 - expanded_mask) * (-1e8) encoded_output = self.encoder( token_embedding + position_embedding, attention_mask=expanded_mask ) From d488b7523af0fea4ac94cade1f84b763548b2a7e Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 19:33:48 +0000 Subject: [PATCH 09/14] change causal masks --- keras_cv/models/feature_extractor/clip/clip_text_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index bb096c805d..d98e13160b 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -34,6 +34,7 @@ def __init__( # Zero out the lower diagonal mask = ops.triu(mask) mask = ops.cast(mask, "float32") + mask = (1.0 - mask) * (-1e8) self.encoder = CLIPEncoder( width=transformer_width, layers=transformer_layers, From d9d126430d5e1ae3ae18291ce60f75c2b7d2532a Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 19:42:00 +0000 Subject: [PATCH 10/14] undo previous commit --- keras_cv/models/feature_extractor/clip/clip_text_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index d98e13160b..bb096c805d 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -34,7 +34,6 @@ def __init__( # Zero out the lower diagonal mask = ops.triu(mask) mask = ops.cast(mask, "float32") - mask = (1.0 - mask) * (-1e8) self.encoder = CLIPEncoder( width=transformer_width, layers=transformer_layers, From 64d66b54d172e16235e07993779086155921059a Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 23:27:34 +0000 Subject: [PATCH 11/14] update attention masks --- keras_cv/models/feature_extractor/clip/clip_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py index 1be8ff1991..d1146985c3 100644 --- a/keras_cv/models/feature_extractor/clip/clip_encoder.py +++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py @@ -69,11 +69,11 @@ def attention(self, x, attention_mask=None): if attention_mask is not None else None ) + mask = ops.add(self.attn_mask, attention_mask) return self.attn( x, - attention_mask=attention_mask, - causal_attention_mask=self.attn_mask, + attention_mask=mask, ) def build(self, input_shape): From de0be1907fb66ac68e96b97548ef799bd34afebe Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 8 Feb 2024 23:51:02 +0000 Subject: [PATCH 12/14] update clip encoder --- .../models/feature_extractor/clip/clip_encoder.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py index d1146985c3..8be0cd05fa 100644 --- a/keras_cv/models/feature_extractor/clip/clip_encoder.py +++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py @@ -59,17 +59,18 @@ def __init__( ) def attention(self, x, attention_mask=None): - self.attn_mask = ( + mask = ( ops.cast(self.attn_mask, dtype=x.dtype) if self.attn_mask is not None else None ) - attention_mask = ( - ops.cast(attention_mask, dtype=x.dtype) - if attention_mask is not None - else None - ) - mask = ops.add(self.attn_mask, attention_mask) + if attention_mask is not None: + attention_mask = ( + ops.cast(attention_mask, dtype=x.dtype) + if attention_mask is not None + else None + ) + mask = ops.add(self.attn_mask, attention_mask) return self.attn( x, From 4b8c1efd8a9f5b8e03d38dd297797449f81ec82e Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Fri, 9 Feb 2024 00:48:53 +0000 Subject: [PATCH 13/14] add print statements --- keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index bb096c805d..715c3a8dd3 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -64,6 +64,7 @@ def call(self, inputs, attention_mask=None): encoded_output = self.encoder( token_embedding + position_embedding, attention_mask=expanded_mask ) + print("encoded_output", encoded_output) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1 @@ -71,6 +72,7 @@ def call(self, inputs, attention_mask=None): selected_features = ops.take_along_axis( layer_norm, indices[:, :, None], axis=1 ) + print("pooler output", selected_features) text_features = self.text_projector(selected_features) output = ops.squeeze(text_features, axis=1) return output From 54f02e81b0bec5035b004673c5a4c9371286a339 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Fri, 9 Feb 2024 01:31:54 +0000 Subject: [PATCH 14/14] update the pooler output --- keras_cv/models/feature_extractor/clip/clip_text_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py index 715c3a8dd3..fc765fc67b 100644 --- a/keras_cv/models/feature_extractor/clip/clip_text_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -67,7 +67,7 @@ def call(self, inputs, attention_mask=None): print("encoded_output", encoded_output) layer_norm = self.ln_final(encoded_output) indices = ops.expand_dims( - ops.cast(ops.argmax(inputs, axis=1), "int32"), axis=-1 + ops.cast(ops.argmax(inputs, axis=-1), "int32"), axis=-1 ) selected_features = ops.take_along_axis( layer_norm, indices[:, :, None], axis=1