From ba1db83d8db9ca9c7dd66e9732749365ddc0e4d9 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 18 Dec 2024 18:16:45 -0800 Subject: [PATCH 1/3] Remove tensorflow-text from source build --- .github/container/Dockerfile.maxtext | 45 +-------------------- .github/container/Dockerfile.pax | 40 +------------------ .github/container/Dockerfile.t5x | 46 ---------------------- .github/container/manifest.yaml | 2 +- .github/workflows/_ci.yaml | 3 -- rosetta/Dockerfile.gemma | 40 +------------------ rosetta/rosetta/projects/maxtext/README.md | 2 +- 7 files changed, 6 insertions(+), 172 deletions(-) diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 87b73efcd..b8e7ff025 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -2,57 +2,16 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT=/opt/maxtext -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text - -############################################################################### -## build tensorflow-text and lingvo, which do not have working arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ - -# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -RUN <<"EOF" bash -exu -o pipefail -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are 
sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. -echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF ############################################################################### ## Download source and add auxiliary scripts ############################################################################### +ARG BASE_IMAGE FROM ${BASE_IMAGE} as mealkit ARG URLREF_MAXTEXT -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text - -# Preserve version information of tensorflow-text -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-maxtext.in RUN <<"EOF" bash -ex git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} @@ -86,5 +45,5 @@ RUN pip-finalize.sh WORKDIR ${SRC_PATH_MAXTEXT} -# When tftext and lingvo wheels are published on pypi.org, revert this +# When tftext wheels are published on pypi.org, revert this # Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 938bd853c..e6667a307 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -3,11 +3,9 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_PAXML=https://github.com/google/paxml.git#main ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG 
URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_LINGVO=/opt/lingvo ############################################################################### @@ -17,32 +15,6 @@ ARG SRC_PATH_LINGVO=/opt/lingvo ARG BASE_IMAGE FROM ${BASE_IMAGE} as wheel-builder -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ - -# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. 
-echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - #------------------------------------------------------------------------------ # build lingvo #------------------------------------------------------------------------------ @@ -50,13 +22,8 @@ EOF # Remove Lingvo build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as lingvo-builder ARG URLREF_LINGVO -ARG SRC_PATH_TFTEXT ARG SRC_PATH_LINGVO -# Preserve the version of tensorflow-text -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ - ENV USE_BAZEL_VERSION=7.1.2 # build lingvo @@ -92,7 +59,6 @@ fi pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 /opt/tensorflow_text*.whl for pattern in \ "s|tensorflow=|#tensorflow=|g" \ - "s|tensorflow-text=|#tensorflow-text=|g" \ "s|dataclasses=|#dataclasses=|g" \ "s|==.*||g" \ ; do @@ -101,7 +67,7 @@ done # Lingvo support only python < 3.12, so we hack it and update dependencies # to be able to build for py-3.12 for pattern in \ - "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.0|g" \ + "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.1|g" \ "s|tensorflow~=2.13.0|tensorflow~=2.18.0|g" \ "s|python_requires='>=3.8,<3.11'|python_requires='>=3.8,<3.13'|" \ ; do @@ -128,16 +94,12 @@ ARG URLREF_PAXML ARG URLREF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS -ARG SRC_PATH_TFTEXT # Preserve version information of tensorflow-text and lingvo COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*-linux*.whl /opt/ RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> 
/opt/pip-tools.d/requirements-paxml.in - # paxml + praxis RUN <<"EOF" bash -ex echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index ea4bbf2ec..1568ff559 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -3,64 +3,18 @@ # docker buildx build -f Dockerfile.t5x --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_T5X=https://github.com/google-research/t5x.git#main ARG URLREF_AIRIO=https://github.com/google/airio.git#main -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_T5X=/opt/t5x ARG SRC_PATH_AIRIO=/opt/airio - -############################################################################### -## build several packages which do not have working arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -RUN <<"EOF" bash -exu -o pipefail -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. 
-# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. -echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - - -############################################################################### -## T5X -############################################################################### - ARG BASE_IMAGE FROM ${BASE_IMAGE} AS mealkit ARG URLREF_T5X ARG URLREF_AIRIO -ARG SRC_PATH_TFTEXT ARG SRC_PATH_T5X ARG SRC_PATH_AIRIO -# Preserve version information of tensorflow-text -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-t5x.in - RUN <<"EOF" bash -ex # 1. 
Fetch T5X git-clone.sh "${URLREF_T5X}" "${SRC_PATH_T5X}" diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index b9c06e2e6..42d899398 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -63,7 +63,7 @@ tensorflow-text: # Used only in ARM pax and t5x builds url: https://github.com/tensorflow/text.git tracking_ref: master - latest_verified_commit: 1779b3ae16f7bd287c4edcf66d62208dc63256f3 + latest_verified_commit: d605a8d44b4a26d3426fe251ef028444f932b9a2 mode: git-clone pydantic: version: X.Y.Z diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b4f3b8143..c2b4cb4ee 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -111,7 +111,6 @@ jobs: DOCKERFILE: .github/container/Dockerfile.maxtext EXTRA_BUILD_ARGS: | URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} secrets: inherit build-levanter: @@ -143,7 +142,6 @@ jobs: DOCKERFILE: .github/container/Dockerfile.t5x EXTRA_BUILD_ARGS: | URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} secrets: inherit @@ -161,7 +159,6 @@ jobs: EXTRA_BUILD_ARGS: | URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} - URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} secrets: inherit diff --git a/rosetta/Dockerfile.gemma b/rosetta/Dockerfile.gemma index e7db16dcc..4a0ba2965 100644 --- a/rosetta/Dockerfile.gemma +++ b/rosetta/Dockerfile.gemma @@ -11,40 +11,7 @@ ARG URLREF_FLAXFORMER=https://github.com/google/flaxformer.git#main ARG SRC_PATH_FLAXFORMER=/opt/flaxformer ARG URLREF_PANOPTICAPI=https://github.com/akolesnikoff/panopticapi.git#mute ARG SRC_PATH_PANOPTICAPI=/opt/panopticapi 
-ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text -############################################################################### -## Build several packages which do not have working amd64/arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. 
-echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF ############################################################################### ## Download source and add auxiliary scripts @@ -62,11 +29,6 @@ ARG URLREF_FLAXFORMER ARG SRC_PATH_FLAXFORMER ARG URLREF_PANOPTICAPI ARG SRC_PATH_PANOPTICAPI -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT - -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ RUN <<"EOF" bash -ex git-clone.sh ${URLREF_GEMMA} ${SRC_PATH_GEMMA} @@ -93,7 +55,7 @@ optax protobuf tfds-nightly tensorflow -tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl) +tensorflow-text tensorflow-gan " >> /opt/pip-tools.d/requirements-gemma.in EOF diff --git a/rosetta/rosetta/projects/maxtext/README.md b/rosetta/rosetta/projects/maxtext/README.md index 2320a7ed9..44baa19ef 100644 --- a/rosetta/rosetta/projects/maxtext/README.md +++ b/rosetta/rosetta/projects/maxtext/README.md @@ -93,7 +93,7 @@ We have run some intial performance and functionality tests with [LLaMA2-7B](htt Please refer to the [example run script](scripts/example_slurm.sub) for more details. We will continue to add more models and associated performance metrics. # Notes -1. The only changes we need to support multiprocessing is to pin tensorflow and tensorflow-text to 2.13.0 version. +1. The only change we need to support multiprocessing is to pin tensorflow and tensorflow-text to version 2.18.0 or higher. 2. In order to remove extra copies introduced by DUS (dynamic update slice) when used in conjunction with custom NVIDIA kernels (like cuBLAS for GEMMs), the `--xla_gpu_enable_custom_fusions` and `--xla_gpu_enable_address_computation_fusion` flags were introduced. However, the current XLA has some limitation and sometimes using these flags lead to error. 
So, in this release, it is advised to turn off these two flags: - --xla_gpu_enable_custom_fusions=false - --xla_gpu_enable_address_computation_fusion=false From 6e5181cbf581d6d11ceb125c02a561f30061c1ca Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 18 Dec 2024 20:45:20 -0800 Subject: [PATCH 2/3] Remove tensorflow-text from source build --- .github/container/Dockerfile.pax | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index e6667a307..50be1741e 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -56,7 +56,7 @@ EOFINNER fi -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 /opt/tensorflow_text*.whl +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 for pattern in \ "s|tensorflow=|#tensorflow=|g" \ "s|dataclasses=|#dataclasses=|g" \ From b9b0aee1dbb5f181b380196c74c56c3e4ff2d385 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 20 Dec 2024 15:25:23 -0700 Subject: [PATCH 3/3] Remove duplicated BASE_IMAGE in MaxText docker file --- .github/container/Dockerfile.maxtext | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index b8e7ff025..d19905724 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -8,7 +8,6 @@ ARG SRC_PATH_MAXTEXT=/opt/maxtext ## Download source and add auxiliary scripts ############################################################################### -ARG BASE_IMAGE FROM ${BASE_IMAGE} as mealkit ARG URLREF_MAXTEXT ARG SRC_PATH_MAXTEXT