diff --git a/hot3d/clips/bop_format_converters/README.md b/hot3d/clips/bop_format_converters/README.md
new file mode 100755
index 0000000..1841dbf
--- /dev/null
+++ b/hot3d/clips/bop_format_converters/README.md
@@ -0,0 +1,48 @@
+# Scripts to convert the HOT3D dataset from its native format to the BOP format
+
+### Set Environment Variables
+
+Before running the scripts, set the following environment variable:
+
+- `HOT3D_DIR`: Path to the HOT3D dataset directory. Converted data will be saved to the same directory.
+
+```bash
+export HOT3D_DIR=
+```
+
+### Convert the object models from the HOT3D format to the BOP format
+
+To convert the (full) models:
+
+```bash
+python hot3d_models_to_bop.py --input-gltf-dir $HOT3D_DIR/object_models --output-bop-dir $HOT3D_DIR/models
+```
+
+To convert the eval models:
+
+```bash
+python hot3d_models_eval_to_bop.py --input-gltf-dir $HOT3D_DIR/object_models_eval --output-bop-dir $HOT3D_DIR/models_eval
+```
+
+Copy the `models_info.json` files from the native model directories to the corresponding output directories:
+
+```bash
+cp $HOT3D_DIR/object_models/models_info.json $HOT3D_DIR/models/models_info.json
+cp $HOT3D_DIR/object_models_eval/models_info.json $HOT3D_DIR/models_eval/models_info.json
+```
+
+### Convert HOT3D clips to the BOP format
+
+To convert HOT3D clips to the BOP format, run the following command.
+
+Parameters:
+- `--split`: One of "train_aria", "train_quest3", "test_aria", or "test_quest3".
+- `--num-threads`: Optional, with a default of 4. Use 4 or 8 threads for better performance.
+
+```bash
+# Converted data will be saved to $HOT3D_DIR/<split>_scenewise.
+python hot3d_clips_to_bop_scenewise.py \
+    --hot3d-dataset-path $HOT3D_DIR \
+    --split \
+    --num-threads
+```
diff --git a/hot3d/clips/bop_format_converters/hot3d_clips_to_bop_scenewise.py b/hot3d/clips/bop_format_converters/hot3d_clips_to_bop_scenewise.py
new file mode 100644
index 0000000..ccfc3ab
--- /dev/null
+++ b/hot3d/clips/bop_format_converters/hot3d_clips_to_bop_scenewise.py
@@ -0,0 +1,401 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script converts the HOT3D-Clips dataset used for the BOP challenge to the BOP format.
+NOTE: The BOP format was updated from its classical format to a new format.
+    The classical format has one main modality (rgb or gray) and depth.
+    The new format can have multiple modalities (rgb, gray1, gray2) and no depth.
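+    For example, a classical BOP scene has a single image folder (rgb/ or gray/)
+    plus depth/ and one scene_camera.json, while a scene converted by this script
+    has one image folder per stream (e.g. rgb/, gray1/, gray2/) and per-stream
+    JSON files such as scene_camera_gray1.json (illustrative summary; see the
+    output layout sketched before the worker function below).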
+""" + +import argparse +import json +import multiprocessing +import os + +import sys +import tarfile + +import cv2 +import numpy as np +from bop_toolkit_lib import misc +from PIL import Image +from tqdm import tqdm + +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(0, parent_dir) +import clip_util + + +def main(): + # setup args + parser = argparse.ArgumentParser() + parser.add_argument("--hot3d-dataset-path", required=True, type=str) + # BOP dataset split name + parser.add_argument("--split", required=True, type=str) + # number of threads + parser.add_argument("--num-threads", type=int, default=4) + + args = parser.parse_args() + + # if split contains "quest3" + if "quest3" in args.split: + args.camera_streams_id = ["1201-1", "1201-2"] + args.camera_streams_names = ["gray1", "gray2"] + elif "aria" in args.split: + args.camera_streams_id = ["214-1", "1201-1", "1201-2"] + args.camera_streams_names = ["rgb", "gray1", "gray2"] + else: + print( + "Split is neither quest3 nor aria.\n" + "There are only 4 split type in Hot3D: train_quest3, test_quest3, train_aria, test_aria." + ) + exit() + + # paths + clips_input_dir = os.path.join(args.hot3d_dataset_path, args.split) + scenes_output_dir = os.path.join(args.hot3d_dataset_path, args.split + "_scenewise") + + # list all clips names in the dataset + split_clips = sorted([p for p in os.listdir(clips_input_dir) if p.endswith(".tar")]) + + # create output directory + os.makedirs(scenes_output_dir, exist_ok=False) + + # Progress bar setup + with tqdm(total=len(split_clips), desc="Processing clips") as pbar: + # Use a Pool of 8 processes + with multiprocessing.Pool(processes=args.num_threads) as pool: + # Use imap_unordered to get results as soon as they're ready + for _ in pool.imap_unordered( + worker, + ( + (clip, clips_input_dir, scenes_output_dir, args) + for clip in split_clips + ), + ): + pbar.update(1) + + +def worker(args): + clip, clips_input_dir, scenes_output_dir, args = args + process_clip(clip, clips_input_dir, scenes_output_dir, args) + + +def process_clip(clip, clips_input_dir, scenes_output_dir, args): + # get clip id + clip_name = clip.split(".")[0].split("-")[1] + + # extract clip + tar = tarfile.open(os.path.join(clips_input_dir, clip), "r") + + # make scene folder and files for the scene + scene_output_dir = os.path.join(scenes_output_dir, clip_name) + os.makedirs(scene_output_dir, exist_ok=True) + + # make path of folders and folders + # eg: STREAM_NAME, mask_STREAM_NAME, mask_visib_STREAM_NAME + # also create path for each json file + # eg: scene_camera_STREAM_NAME.json, scene_gt_STREAM_NAME.json, scene_gt_info_STREAM_NAME.json + # create a dictionary for all camera streams + clip_stream_paths = {} + for stream_name in args.camera_streams_names: + # directories + stream_image_dir = os.path.join(scene_output_dir, stream_name) + os.makedirs(stream_image_dir, exist_ok=True) + clip_stream_paths[stream_name] = stream_image_dir + stream_mask_dir = os.path.join(scene_output_dir, f"mask_{stream_name}") + os.makedirs(stream_mask_dir, exist_ok=True) + clip_stream_paths[f"mask_{stream_name}"] = stream_mask_dir + stream_mask_visib_dir = os.path.join( + scene_output_dir, f"mask_visib_{stream_name}" + ) + os.makedirs(stream_mask_visib_dir, exist_ok=True) + clip_stream_paths[f"mask_visib_{stream_name}"] = stream_mask_visib_dir + # json files + stream_scene_camera_json_path = os.path.join( + scene_output_dir, f"scene_camera_{stream_name}.json" + ) + clip_stream_paths[f"scene_camera_{stream_name}"] = 
+
+
+def worker(worker_args):
+    # Unpack the arguments and process a single clip.
+    clip, clips_input_dir, scenes_output_dir, args = worker_args
+    process_clip(clip, clips_input_dir, scenes_output_dir, args)
+
+
+def process_clip(clip, clips_input_dir, scenes_output_dir, args):
+    # Get the clip ID from the tar file name.
+    clip_name = clip.split(".")[0].split("-")[1]
+
+    # Open the clip's tar archive.
+    tar = tarfile.open(os.path.join(clips_input_dir, clip), "r")
+
+    # Make the scene folder.
+    scene_output_dir = os.path.join(scenes_output_dir, clip_name)
+    os.makedirs(scene_output_dir, exist_ok=True)
+
+    # Create the per-stream folders,
+    # e.g. STREAM_NAME, mask_STREAM_NAME, mask_visib_STREAM_NAME,
+    # and the path of each JSON file,
+    # e.g. scene_camera_STREAM_NAME.json, scene_gt_STREAM_NAME.json, scene_gt_info_STREAM_NAME.json.
+    # Collect everything in one dictionary covering all camera streams.
+    clip_stream_paths = {}
+    for stream_name in args.camera_streams_names:
+        # Directories.
+        stream_image_dir = os.path.join(scene_output_dir, stream_name)
+        os.makedirs(stream_image_dir, exist_ok=True)
+        clip_stream_paths[stream_name] = stream_image_dir
+        stream_mask_dir = os.path.join(scene_output_dir, f"mask_{stream_name}")
+        os.makedirs(stream_mask_dir, exist_ok=True)
+        clip_stream_paths[f"mask_{stream_name}"] = stream_mask_dir
+        stream_mask_visib_dir = os.path.join(
+            scene_output_dir, f"mask_visib_{stream_name}"
+        )
+        os.makedirs(stream_mask_visib_dir, exist_ok=True)
+        clip_stream_paths[f"mask_visib_{stream_name}"] = stream_mask_visib_dir
+        # JSON files.
+        stream_scene_camera_json_path = os.path.join(
+            scene_output_dir, f"scene_camera_{stream_name}.json"
+        )
+        clip_stream_paths[f"scene_camera_{stream_name}"] = (
+            stream_scene_camera_json_path
+        )
+        stream_scene_gt_json_path = os.path.join(
+            scene_output_dir, f"scene_gt_{stream_name}.json"
+        )
+        clip_stream_paths[f"scene_gt_{stream_name}"] = stream_scene_gt_json_path
+        stream_scene_gt_info_json_path = os.path.join(
+            scene_output_dir, f"scene_gt_info_{stream_name}.json"
+        )
+        clip_stream_paths[f"scene_gt_info_{stream_name}"] = (
+            stream_scene_gt_info_json_path
+        )
+
+    # Make a dict of dicts with stream names as keys.
+    scene_camera_data = {}
+    scene_gt_data = {}
+    scene_gt_info_data = {}
+    for stream_name in args.camera_streams_names:
+        # Add an empty dict for each stream.
+        scene_camera_data[stream_name] = {}
+        scene_gt_data[stream_name] = {}
+        scene_gt_info_data[stream_name] = {}
+
+    # Loop over all frames.
+    for frame_id in range(clip_util.get_number_of_frames(tar)):
+        frame_key = f"{frame_id:06d}"
+
+        # Load the camera parameters from FRAME_ID.cameras.json.
+        frame_camera = clip_util.load_cameras(tar, frame_key)
+        # Read FRAME_ID.objects.json.
+        frame_objects = clip_util.load_object_annotations(tar, frame_key)
+
+        # Read the calibration JSON as-is.
+        camera_json_file_name = f"{frame_id:06d}.cameras.json"
+        camera_json_file = tar.extractfile(camera_json_file_name)
+        frame_camera_data = json.load(camera_json_file)
+
+        # Read FRAME_ID.info.json.
+        frame_info_file_name = f"{frame_id:06d}.info.json"
+        frame_info_file = tar.extractfile(frame_info_file_name)
+        frame_info_data = json.load(frame_info_file)
+
+        # Loop over all camera streams.
+        for stream_index, stream_name in enumerate(args.camera_streams_names):
+            stream_id = args.camera_streams_id[stream_index]
+
+            # Load the image corresponding to the stream and frame.
+            image = clip_util.load_image(tar, frame_key, stream_id)
+            # If the image is RGB (3 channels), convert it to BGR for OpenCV.
+            if image.ndim == 3:
+                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+            # Save the image.
+            image_path = os.path.join(
+                clip_stream_paths[stream_name], frame_key + ".jpg"
+            )
+            cv2.imwrite(image_path, image)
+
+            # Fill scene_camera.json.
+
+            # Get T_world_from_camera and invert it to get the w2c transform.
+            T_world_from_camera = frame_camera[stream_id].T_world_from_eye
+            T_world_to_camera = np.linalg.inv(T_world_from_camera)
+
+            # Get the camera parameters.
+            calibration = frame_camera_data[stream_id]["calibration"]
+
+            # Add the frame's scene_camera data.
+            scene_camera_data[stream_name][int(frame_id)] = {
+                "cam_model": calibration,
+                "device": frame_info_data["device"],
+                "image_timestamps_ns": frame_info_data["image_timestamps_ns"][
+                    stream_id
+                ],
+                # "cam_K":  # not used as cam_model exists
+                # "depth_scale":  # also not used
+                # Convert the translation from meters to millimeters.
+                "cam_R_w2c": T_world_to_camera[:3, :3].flatten().tolist(),
+                "cam_t_w2c": (T_world_to_camera[:3, 3] * 1000).tolist(),
+            }
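+
+            # A resulting scene_camera entry looks roughly like this (values are
+            # illustrative, not taken from a real clip):
+            #   "0": {
+            #       "cam_model": {...},  # calibration dict as read from the clip
+            #       "device": "...",
+            #       "image_timestamps_ns": 123456789,
+            #       "cam_R_w2c": [r11, r12, ..., r33],  # flattened 3x3 rotation
+            #       "cam_t_w2c": [tx, ty, tz],  # translation in millimeters
+            #   }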
+
+            # Camera parameters of the current image.
+            # camera_model = frame_camera[stream_id]
+
+            frame_scene_gt_data = []
+            frame_scene_gt_info_data = []
+            # Loop over all objects in the frame.
+            for anno_id, obj_key in enumerate(frame_objects):
+                obj_data = frame_objects[obj_key][0]
+
+                # Objects that are out of the current frame's scope get dummy
+                # annotations of -1 (they are probably visible in other frames).
+                # Two cases are checked:
+                # 1) The object is not visible in the current stream, i.e. the
+                #    stream ID is missing from the keys of visibilities_modeled.
+                # 2) The RLE mask (a list) is empty, which happens for objects
+                #    with very low visibility (< 0.001).
+                if (
+                    stream_id not in obj_data["visibilities_modeled"]
+                    or not obj_data["masks_amodal"][stream_id]["rle"]
+                ):
+                    # Use dummy translation and rotation values of -1.
+                    object_frame_scene_gt_anno = {
+                        "obj_id": int(obj_key),
+                        "cam_R_m2c": [-1, -1, -1, -1, -1, -1, -1, -1, -1],
+                        "cam_t_m2c": [-1, -1, -1],
+                    }
+                    object_frame_scene_gt_info_anno = {
+                        "bbox_obj": [-1, -1, -1, -1],
+                        "bbox_visib": [-1, -1, -1, -1],
+                        "px_count_all": 0,
+                        # "px_count_valid": px_count_all,  # excluded as HOT3D is RGB only - TODO check
+                        "px_count_visib": 0,
+                        "visib_fract": 0,
+                    }
+                    # Make an empty mask and mask_visib.
+                    width = frame_camera_data[stream_id]["calibration"]["image_width"]
+                    height = frame_camera_data[stream_id]["calibration"]["image_height"]
+                    mask = Image.new("L", (width, height), 0)
+                    mask_visib = Image.new("L", (width, height), 0)
+                else:
+                    # bop_id = int(obj_data["object_bop_id"])  # same as obj_key
+
+                    # Transformation from the model space to the world space.
+                    T_world_from_model = clip_util.se3_from_dict(
+                        obj_data["T_world_from_object"]
+                    )
+
+                    # Get the object pose in the camera frame.
+                    T_camera_from_model = (
+                        np.linalg.inv(T_world_from_camera) @ T_world_from_model
+                    )
+
+                    object_frame_scene_gt_anno = {
+                        "obj_id": int(obj_key),
+                        "cam_R_m2c": T_camera_from_model[:3, :3].flatten().tolist(),
+                        "cam_t_m2c": (T_camera_from_model[:3, 3] * 1000).tolist(),
+                    }
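+
+                    # Convention note (added summary): BOP poses satisfy
+                    # x_cam = R_m2c @ x_model + t_m2c, so [R|t] above is taken from
+                    # T_camera_from_model, with t converted from meters to mm.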
+
+                    # Read the amodal mask.
+                    rle_dict = obj_data["masks_amodal"][stream_id]
+                    if not rle_dict["rle"]:
+                        # If "rle" is an empty list here, the visibility check above
+                        # should have caught it.
+                        print(
+                            "RLE mask is empty!",
+                            "For scene_id: {}, frame_id: {}, obj_id: {}.".format(
+                                clip_name, frame_id, obj_key
+                            ),
+                            "This case shouldn't happen. It may be an edge case that is not covered here.",
+                            "The process will exit.",
+                        )
+                        sys.exit(1)
+                    else:
+                        mask = custom_rle_to_mask(
+                            rle_dict["height"], rle_dict["width"], rle_dict["rle"]
+                        )
+                        mask = Image.fromarray(mask * 255)
+                        mask = mask.convert("L")
+
+                    # Read the modal mask.
+                    rle_dict = obj_data["masks_modal"][stream_id]
+                    # If "rle" is an empty list, make an empty mask.
+                    if not rle_dict["rle"]:
+                        mask_visib = Image.new(
+                            "L", (rle_dict["width"], rle_dict["height"]), 0
+                        )
+                    else:
+                        mask_visib = custom_rle_to_mask(
+                            rle_dict["height"], rle_dict["width"], rle_dict["rle"]
+                        )
+                        mask_visib = Image.fromarray(mask_visib * 255)
+                        mask_visib = mask_visib.convert("L")
+
+                    px_count_all = cv2.countNonZero(np.array(mask))
+                    px_count_visib = cv2.countNonZero(np.array(mask_visib))
+                    # Visible fraction: the minimum of the modeled and predicted visibilities.
+                    visibilities_modeled = obj_data["visibilities_modeled"][stream_id]
+                    visibilities_predicted = obj_data["visibilities_predicted"][
+                        stream_id
+                    ]
+                    visib_fract = min(visibilities_modeled, visibilities_predicted)
+
+                    bbox_obj = obj_data["boxes_amodal"][stream_id]
+                    # Convert the bbox from xyxy to xywh.
+                    bbox_obj = [
+                        bbox_obj[0],
+                        bbox_obj[1],
+                        bbox_obj[2] - bbox_obj[0],
+                        bbox_obj[3] - bbox_obj[1],
+                    ]
+                    bbox_obj = [int(val) for val in bbox_obj]
+                    # bbox_visib.
+                    if px_count_visib > 0:
+                        ys, xs = np.asarray(mask_visib).nonzero()
+                        im_size = mask_visib.size
+                        bbox_visib = misc.calc_2d_bbox(xs, ys, im_size)
+                        bbox_visib = [int(x) for x in bbox_visib]
+                    else:
+                        bbox_visib = [-1, -1, -1, -1]
+                    # Add the scene_gt_info data.
+                    object_frame_scene_gt_info_anno = {
+                        "bbox_obj": bbox_obj,
+                        "bbox_visib": bbox_visib,
+                        "px_count_all": px_count_all,
+                        # "px_count_valid": px_count_all,  # excluded as HOT3D is RGB only - TODO check
+                        "px_count_visib": px_count_visib,
+                        "visib_fract": visib_fract,
+                    }
+
+                anno_id = f"{anno_id:06d}"
+                # Save the mask as FRAME-ID_ANNO-ID.png.
+                mask_path = os.path.join(
+                    clip_stream_paths[f"mask_{stream_name}"],
+                    frame_key + "_" + anno_id + ".png",
+                )
+                mask.save(mask_path)
+                # Save mask_visib as FRAME-ID_ANNO-ID.png.
+                mask_visib_path = os.path.join(
+                    clip_stream_paths[f"mask_visib_{stream_name}"],
+                    frame_key + "_" + anno_id + ".png",
+                )
+                mask_visib.save(mask_visib_path)
+
+                frame_scene_gt_data.append(object_frame_scene_gt_anno)
+                frame_scene_gt_info_data.append(object_frame_scene_gt_info_anno)
+
+            scene_gt_data[stream_name][int(frame_id)] = frame_scene_gt_data
+            scene_gt_info_data[stream_name][int(frame_id)] = frame_scene_gt_info_data
+
+    # Save scene_camera.json, scene_gt.json and scene_gt_info.json for each camera stream.
+    for stream_name in args.camera_streams_names:
+        with open(clip_stream_paths[f"scene_camera_{stream_name}"], "w") as f:
+            json.dump(scene_camera_data[stream_name], f, indent=4)
+        with open(clip_stream_paths[f"scene_gt_{stream_name}"], "w") as f:
+            json.dump(scene_gt_data[stream_name], f, indent=4)
+        with open(clip_stream_paths[f"scene_gt_info_{stream_name}"], "w") as f:
+            json.dump(scene_gt_info_data[stream_name], f, indent=4)
+
+
+def custom_rle_to_mask(height, width, rle):
+    """
+    Convert a custom RLE (Run-Length Encoding) to a binary mask using vectorized operations.
+
+    Parameters:
+    - height (int): The height of the mask.
+    - width (int): The width of the mask.
+    - rle (list): The custom RLE list [start, length, start, length, ...].
+
+    Returns:
+    - np.ndarray: The binary mask.
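+
+    Example (illustrative):
+        >>> custom_rle_to_mask(2, 3, [1, 2, 4, 1])
+        array([[0, 1, 1],
+               [0, 1, 0]], dtype=uint8)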
+    """
+    # Create an empty flat mask.
+    mask = np.zeros(height * width, dtype=np.uint8)
+
+    # Convert the RLE pairs into start and end indices.
+    starts = np.array(rle[0::2])
+    lengths = np.array(rle[1::2])
+    ends = starts + lengths
+
+    # Create an array of the flat indices covered by the runs.
+    run_indices = np.concatenate(
+        [np.arange(start, end) for start, end in zip(starts, ends)]
+    )
+
+    # Set those indices in the mask to 1.
+    mask[run_indices] = 1
+
+    # Reshape the flat array into a 2D mask.
+    return mask.reshape((height, width))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hot3d/clips/bop_format_converters/hot3d_models_eval_to_bop.py b/hot3d/clips/bop_format_converters/hot3d_models_eval_to_bop.py
new file mode 100644
index 0000000..9537db0
--- /dev/null
+++ b/hot3d/clips/bop_format_converters/hot3d_models_eval_to_bop.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script converts the object eval models from the original GLTF format used in
+HOT3D to the PLY format used in the standard BOP format.
+Note: the models_info.json file should be copied from the HOT3D dataset model
+    directory to the output directory. This native models_info.json file contains
+    more data than the standard BOP models_info.json.
+"""
+
+import argparse
+import glob
+import os
+
+import trimesh
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-gltf-dir", required=True, type=str)
+    parser.add_argument("--output-bop-dir", required=True, type=str)
+    args = parser.parse_args()
+
+    os.makedirs(args.output_bop_dir, exist_ok=True)
+
+    mesh_in_paths = sorted(glob.glob(f"{args.input_gltf_dir}/*.glb"))
+
+    for mesh_in_path in mesh_in_paths:
+        print(f"src: {mesh_in_path}")
+        mesh = load_mesh(mesh_in_path)
+
+        # Convert from meters to millimeters.
+        mesh.vertices *= 1000.0
+
+        # Save the mesh as a PLY file in ASCII format.
+        mesh_out_path = os.path.join(
+            args.output_bop_dir, os.path.basename(mesh_in_path).replace(".glb", ".ply")
+        )
+        print(f"dst: {mesh_out_path}")
+        ply_file = trimesh.exchange.ply.export_ply(mesh, encoding="ascii")
+        with open(mesh_out_path, "wb") as f:
+            f.write(ply_file)
+
+
+def load_mesh(path: str) -> trimesh.Trimesh:
+    # Load the scene.
+    scene = trimesh.load_mesh(
+        path,
+        process=False,
+        merge_primitives=True,
+        skip_materials=True,
+        maintain_order=True,
+    )
+
+    # Represent the scene by a single mesh.
+    mesh = scene.dump(concatenate=True)
+
+    # Cleaning the mesh with mesh.process(validate=True) is deliberately skipped,
+    # as it would change vertex indices and normals.
+    # mesh.process(validate=True)
+
+    return mesh
+
+
+if __name__ == "__main__":
+    main()
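+
+# Example invocation (paths are illustrative; see the README):
+#   python hot3d_models_eval_to_bop.py \
+#       --input-gltf-dir $HOT3D_DIR/object_models_eval \
+#       --output-bop-dir $HOT3D_DIR/models_eval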
diff --git a/hot3d/clips/bop_format_converters/hot3d_models_to_bop.py b/hot3d/clips/bop_format_converters/hot3d_models_to_bop.py
new file mode 100644
index 0000000..40a6e45
--- /dev/null
+++ b/hot3d/clips/bop_format_converters/hot3d_models_to_bop.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script converts the object models from the original GLTF format used in
+HOT3D to the PLY format used in the standard BOP format.
+Note: the models_info.json file should be copied from the HOT3D dataset model
+    directory to the output directory. This native models_info.json file contains
+    more data than the standard BOP models_info.json.
+"""
+
+import argparse
+import os
+
+import numpy as np
+import trimesh
+from PIL import Image
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # Add arguments for the input GLTF directory and the output directory.
+    parser.add_argument("--input-gltf-dir", required=True, type=str)
+    parser.add_argument("--output-bop-dir", required=True, type=str)
+
+    args = parser.parse_args()
+
+    # Make the output directory (fails if it already exists).
+    os.makedirs(args.output_bop_dir, exist_ok=False)
+
+    for gltf_filename in os.listdir(args.input_gltf_dir):
+        if gltf_filename.endswith(".glb"):
+            gltf_filepath = os.path.join(args.input_gltf_dir, gltf_filename)
+            ply_filepath = os.path.join(
+                args.output_bop_dir, gltf_filename.replace(".glb", ".ply")
+            )
+            texture_filepath = os.path.join(
+                args.output_bop_dir, gltf_filename.replace(".glb", ".png")
+            )
+
+            # Save the mesh as PLY and the texture as PNG.
+            save_mesh_as_ply_with_uv_and_texture(
+                gltf_filepath, ply_filepath, texture_filepath
+            )
+
+
+def save_mesh_as_ply_with_uv_and_texture(gltf_filepath, ply_filepath, texture_filepath):
+    # Load the GLTF/GLB file using trimesh.
+    scene = trimesh.load(gltf_filepath, process=False, maintain_order=True)
+
+    # Dump the scene to a single mesh.
+    mesh = scene.dump(concatenate=True)
+
+    # Extract vertex positions (converted from meters to millimeters), normals and UVs.
+    vertices = mesh.vertices * 1000.0
+    normals = mesh.vertex_normals
+
+    # Handle cases where UV coordinates might be missing.
+    uv = mesh.visual.uv if mesh.visual.uv is not None else np.zeros((len(vertices), 2))
+
+    # Prepare the vertex data, including UV coordinates.
+    vertex_data = np.hstack([vertices, normals, uv])
+
+    # Prepare the faces.
+    faces = mesh.faces
+
+    # Create a PLY header with a texture file comment.
+    header = f"""ply
+format ascii 1.0
+comment TextureFile {os.path.basename(texture_filepath)}
+element vertex {len(vertices)}
+property float x
+property float y
+property float z
+property float nx
+property float ny
+property float nz
+property float texture_u
+property float texture_v
+element face {len(faces)}
+property list uchar int vertex_indices
+end_header
+"""
+
+    # Write the PLY file.
+    with open(ply_filepath, "w") as ply_file:
+        # Write the header.
+        ply_file.write(header)
+
+        # Write the vertex data.
+        for v in vertex_data:
+            ply_file.write(f"{v[0]} {v[1]} {v[2]} {v[3]} {v[4]} {v[5]} {v[6]} {v[7]}\n")
+
+        # Write the face data.
+        for f in faces:
+            ply_file.write(f"3 {f[0]} {f[1]} {f[2]}\n")
+
+    print(f"Mesh saved as {ply_filepath}")
+
+    # Save the texture as a PNG image.
+    if mesh.visual.material.to_simple().image is not None:
+        mesh.visual.material = mesh.visual.material.to_simple()
+        texture_image = mesh.visual.material.image
+    else:
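+        # Fall back to a solid-color texture so the TextureFile referenced in the
+        # PLY header still exists on disk.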
print("No texture found in the GLTF file, using the main_color instead.") + # make an image 2048x2048 with the main color + main_color = mesh.visual.material.main_color[0:3] + texture_image = np.ones((2048, 2048, 3), dtype=np.uint8) * main_color.astype( + np.uint8 + ) + + # Convert the texture to a PIL Image and save as PNG + image = Image.fromarray(np.array(texture_image)) + image.save(texture_filepath) + print(f"Texture saved as {texture_filepath}") + + +if __name__ == "__main__": + main()