diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..198ee90 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +[*] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = false \ No newline at end of file diff --git a/README.md b/README.md index 32c1176..2140610 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,78 @@ # Containerize an existing conda environment -I use conda environments for working on data analysis projects. -Sometimes I need to revert to install using `pip` or `R`'s -`install.packages` if a package is not on bioconda or conda-forge. +I use conda environments for working on data analysis projects. +Sometimes I need to revert to install using `pip` or `R`'s +`install.packages` if a package is not on bioconda or conda-forge. This makes it very hard to reproduce the environment, and therefore, the analysis, on another system. Even pure conda environments stored as an `environment.yml` file tend to [break after a -while](https://github.com/conda/conda/issues/9257). +while](https://github.com/conda/conda/issues/9257). Using the instructions below allows to package an existing environment into a Docker or Singularity container which should be more portable and can also easily be integrated into a [fully reproducible data analysis workflow](https://grst.github.io/bioinformatics/2019/12/23/reportsrender.html) -based on e.g. [Nextflow](https://www.nextflow.io/). +based on e.g. [Nextflow](https://www.nextflow.io/). -## Prerequisites +## Usage - * [conda-pack](https://conda.github.io/conda-pack/) - * either Docker, Podman or Singularity - * source conda environment needs to be on a linux x64 machine. +``` +usage: conda_to_singularity.py [-h] [--template TEMPLATE] CONDA_ENV OUTPUT_CONTAINER -## Usage +Convert a conda env to a singularity container. -1. 
Clone this repository (retrieve `Dockerfile`/`Singularity`) +positional arguments: + CONDA_ENV Absolute path to the conda environment. Must be exactly the path as it shows up in `conda env list`, not a symbolic link to it, nor a realpath. + OUTPUT_CONTAINER Output path where the singularity container will be saved. -``` -git clone git@github.com:grst/containerize-conda.git -cd containerize-conda +optional arguments: + -h, --help show this help message and exit + --template TEMPLATE Path to a Singularity template file. Must contain a `{conda_env}` placeholder. If not specified, uses the default template shipped with this script. ``` -2. Pack the environment +For example ``` -conda-pack -n -o packed_environment.tar.gz +conda_to_singularity.py /home/sturm/.conda/envs/whatever whatever.sif ``` -3. Build the container +By default, the image will be based on CentOS 7. If you want a different base image, +you can modify `Singularity.template`, and specify it with the `--template` argument. -``` -# With singularity -singularity build --fakeroot Singularity -# With Docker -docker build . -t +## How it works + +Conda envs cannot simply be "moved" as some paths are hardcoded into the environment. +I previously applied `conda-pack` to solve this issue, which works fine in most cases +but breaks in some (especially for old environments that have a long history +of manually installing stuff through R or pip). + +This is another approach where the issue is solved by copying the conda environment +with its full absolute path to the container and appending a line to the Singularity environment +file that activates the conda environment from that path once the container is started: -# With Podman/Buildah -podman build . -t +``` +echo "source /opt/conda/bin/activate {conda_env}" >>$SINGULARITY_ENVIRONMENT ``` -## How it works -Conda environment can't be just "moved" to another location, as some paths are -hardcoded into the environment. 
`conda-pack` takes care of replacing these paths -back to placeholders and creates a `.tar.gz` archive that contains the -environment. This environment can be unpacked to another machine (or, in our -case, a container). Running `conda-unpack` in the environment replaces the -placeholders back to the actual paths matching the new location. +Naively, this could be solved with `%files /path/to/env`, however, this dereferences +all symbolic links, which breaks some environments. Instead, I build a tar archive +that keeps all symbolic links intact *within* the conda environment, but at the +same time include all files that are outside the conda env, but referenced +by a symbolic link. + +I don't have a lot of experience yet if it is really more stable than conda-pack +or just happens to fail in different cases. + +## Where's the conda-pack version? + +This is an updated version of my scripts that works without `conda-pack` and turned out +to work even in cases where the conda-pack variant failed. It works only with Singularity at the moment, though. +If you are looking for the previous scripts based on `conda-pack`, because you need a Docker variant, or they just +work for you, they are in the [conda-pack](conda-pack) folder with a dedicated [README](conda-pack/README.md). + + -## Troubleshooting - * `find . -xtype l` finds broken symbolic links which leads to a failed container creation... 
diff --git a/Singularity.template b/Singularity.template new file mode 100644 index 0000000..5d2534a --- /dev/null +++ b/Singularity.template @@ -0,0 +1,21 @@ +Bootstrap: yum +OSVersion: 7 +MirrorURL: http://mirror.centos.org/centos-%{{OSVERSION}}/%{{OSVERSION}}/os/$basearch/ +Include: yum + +%files + packed_env.tar /packed_env.tar + +%environment + export NUMBA_CACHE_DIR=/tmp/numba_cache + +%post +# yum install -y kernel-3.10.0-1160.11.1.el7 + yum install -y tar + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > /install_conda.sh + chmod +x /install_conda.sh + /install_conda.sh -b -p /opt/conda + rm /install_conda.sh + tar xf /packed_env.tar + rm /packed_env.tar + echo "source /opt/conda/bin/activate {conda_env}" >>$SINGULARITY_ENVIRONMENT \ No newline at end of file diff --git a/Dockerfile b/conda-pack/Dockerfile similarity index 100% rename from Dockerfile rename to conda-pack/Dockerfile diff --git a/conda-pack/README.md b/conda-pack/README.md new file mode 100644 index 0000000..6acdc2f --- /dev/null +++ b/conda-pack/README.md @@ -0,0 +1,45 @@ +## Prerequisites + + * [conda-pack](https://conda.github.io/conda-pack/) + * either Docker, Podman or Singularity + * source conda environment needs to be on a linux x64 machine. + +## Usage + +1. Clone this repository (retrieve `Dockerfile`/`Singularity`) + +``` +git clone git@github.com:grst/containerize-conda.git +cd containerize-conda/conda-pack +``` + +2. Pack the environment + +``` +conda-pack -n -o packed_environment.tar.gz +``` + +3. Build the container + +``` +# With singularity +singularity build --fakeroot Singularity + +# With Docker +docker build . -t + +# With Podman/Buildah +podman build . -t +``` + +## How it works +Conda environment can't be just "moved" to another location, as some paths are +hardcoded into the environment. `conda-pack` takes care of replacing these paths +back to placeholders and creates a `.tar.gz` archive that contains the +environment. 
This environment can be unpacked to another machine (or, in our
+case, a container). Running `conda-unpack` in the environment replaces the
+placeholders back to the actual paths matching the new location.
+
+## Troubleshooting
+
+ * `find . -xtype l` finds broken symbolic links which leads to a failed container creation...
\ No newline at end of file
diff --git a/Singularity b/conda-pack/Singularity
similarity index 100%
rename from Singularity
rename to conda-pack/Singularity
diff --git a/conda_to_singularity.sh b/conda-pack/conda_to_singularity.sh
similarity index 100%
rename from conda_to_singularity.sh
rename to conda-pack/conda_to_singularity.sh
diff --git a/conda_to_singularity.py b/conda_to_singularity.py
new file mode 100755
index 0000000..a3ab68a
--- /dev/null
+++ b/conda_to_singularity.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+"""Package an existing conda environment into a Singularity container."""
+
+import shlex
+import sys
+import tempfile
+from subprocess import call, check_call
+from os.path import abspath, join as join_path, dirname, realpath
+import argparse
+from pathlib import Path
+from time import sleep
+
+
+def _generate_file_list(conda_env, filelist_path):
+    """
+    Write the list of all files that make up the conda env to `filelist_path`.
+
+    We need to include all files as absolute paths, and also the symbolic links they are pointing to (which
+    might be outside the environment). To this end, the first `find` lists all files in the conda env.
+    The second `find` resolves every entry with `readlink -f`, which yields the files the links point to.
+    Using sort/uniq removes the duplicate entries.
+
+    Both paths are shell-quoted, so they may contain spaces or other shell metacharacters.
+    Raises `subprocess.CalledProcessError` if the shell pipeline exits with a non-zero status.
+
+    TODO: While this covered all the cases I encountered so far, I believe this would still fail if there were nested
+    symbolic links outside the repository.
+    """
+    quoted_env = shlex.quote(str(conda_env))
+    quoted_filelist = shlex.quote(str(filelist_path))
+    # Process substitution `<(...)` is a bash feature, hence the explicit bash executable below.
+    command = f"""\
+        set -o pipefail
+
+        cat <(find {quoted_env}) <(find -L {quoted_env} -exec readlink -f "{{}}" ";") | \\
+            sort | \\
+            uniq > {quoted_filelist}
+    """
+    check_call(command, shell=True, executable="/bin/bash")
+
+
+def _build_tar_archive(filelist_path, archive_path):
+    """
+    Build a tar archive at `archive_path` from the paths listed in `filelist_path`.
+
+    A non-zero exit status of `tar` is reported as a warning on stderr instead of aborting: `tar` also
+    exits non-zero when single entries cannot be read (e.g. the missing target of a broken symbolic link),
+    while still archiving all other entries.
+    """
+    returncode = call(["tar", "cf", archive_path, "-T", filelist_path])
+    if returncode != 0:
+        print(
+            f"Warning: tar exited with status {returncode}; the archive may be incomplete.",
+            file=sys.stderr,
+        )
+
+
+def _build_container(tmpdir, singularity_file, output_path):
+    """
+    Actually builds the container.
+
+    tmpdir is the temporary directory that already contains the tar archive. It is used as working
+    directory of `singularity build`.
+
+    Raises `subprocess.CalledProcessError` if `singularity build` fails.
+    """
+    check_call(
+        [
+            "singularity",
+            "build",
+            "--fakeroot",
+            "--force",
+            output_path,
+            singularity_file,
+        ],
+        cwd=tmpdir,
+    )
+
+
+def conda2singularity(conda_env, output_path, template_path):
+    """
+    Build a singularity container at `output_path` that contains the conda env `conda_env`.
+
+    `template_path` is a Singularity definition template with a `{conda_env}` placeholder. The definition
+    file, the file list and the tar archive are created in a temporary directory that is removed afterwards.
+
+    Raises `FileNotFoundError` if `conda_env` is not an existing directory and
+    `subprocess.CalledProcessError` if building the file list or the container fails.
+    """
+    if not Path(conda_env).is_dir():
+        raise FileNotFoundError(f"Conda environment not found: {conda_env}")
+
+    output_path = abspath(output_path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmpdir = Path(tmpdir)
+        print(f"Using temporary directory: {tmpdir}")
+        singularity_file_path = tmpdir / "Singularity"
+        filelist_path = tmpdir / "filelist.txt"
+        tar_archive_path = tmpdir / "packed_env.tar"
+
+        # Read the Singularity template file and fill in the `{conda_env}` placeholder
+        with open(template_path) as f:
+            template = f.read().format(conda_env=conda_env)
+
+        # Write formatted template file
+        with open(singularity_file_path, "w") as f:
+            f.write(template)
+
+        print("Building file list...")
+        _generate_file_list(conda_env, filelist_path)
+
+        # We are using a tar archive as tar is the only way of getting the symbolic links into the singularity
+        # container as symbolic links.
+        print("Building tar archive...")
+        _build_tar_archive(filelist_path, tar_archive_path)
+
+        print("Building singularity container...")
+        _build_container(tmpdir, singularity_file_path, output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert a conda env to a singularity container."
+    )
+    parser.add_argument(
+        "CONDA_ENV",
+        help="Absolute path to the conda environment. Must be exactly the path as it shows up in `conda env list`, not a symbolic link to it, nor a realpath.",
+    )
+    parser.add_argument(
+        "OUTPUT_CONTAINER",
+        help="Output path where the singularity container will be saved.",
+    )
+    parser.add_argument(
+        "--template",
+        help="Path to a Singularity template file. Must contain a `{conda_env}` placeholder. If not specified, uses the default template shipped with this script.",
+        default=join_path(dirname(realpath(__file__)), "Singularity.template"),
+    )
+    args = parser.parse_args()
+    conda2singularity(args.CONDA_ENV, args.OUTPUT_CONTAINER, args.template)