From d34a9b402e9e66806969090c4172c7cf747b1e06 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 23 Nov 2024 19:41:37 +0100 Subject: [PATCH] compact: add --stats option With --stats, compact is as slow as before, because it lists all repo objects. Without --stats, it is faster, because it uses the cached chunks index. --- src/borg/archiver/compact_cmd.py | 63 +++++++++++++------ .../testsuite/archiver/compact_cmd_test.py | 35 ++++++++--- 2 files changed, 71 insertions(+), 27 deletions(-) diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index 35b08275e3..a89a6edcb1 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -3,7 +3,7 @@ from ._common import with_repository from ..archive import Archive -from ..cache import write_chunkindex_to_repo_cache +from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo from ..constants import * # NOQA from ..hashindex import ChunkIndex, ChunkIndexEntry from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex @@ -18,7 +18,7 @@ class ArchiveGarbageCollector: - def __init__(self, repository, manifest): + def __init__(self, repository, manifest, *, stats): self.repository = repository assert isinstance(repository, (Repository, RemoteRepository)) self.manifest = manifest @@ -26,17 +26,17 @@ def __init__(self, repository, manifest): self.total_files = None # overall number of source files written to all archives in this repo self.total_size = None # overall size of source file content data written to all archives self.archives_count = None # number of archives + self.stats = stats # compute repo space usage before/after - lists all repo objects, can be slow. 
@property def repository_size(self): - if self.chunks is None: + if self.chunks is None or not self.stats: return None return sum(entry.size for id, entry in self.chunks.iteritems()) # sum of stored sizes def garbage_collect(self): """Removes unused chunks from a repository.""" logger.info("Starting compaction / garbage collection...") - logger.info("Getting object IDs present in the repository...") self.chunks = self.get_repository_chunks() logger.info("Computing object IDs used by archives...") (self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = ( @@ -47,20 +47,30 @@ def garbage_collect(self): logger.info("Finished compaction / garbage collection...") def get_repository_chunks(self) -> ChunkIndex: - """Build a dict id -> size of all chunks present in the repository""" - chunks = ChunkIndex() - for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT): - # we add this id to the chunks index (as unused chunk), because - # we do not know yet whether it is actually referenced from some archives. - # we "abuse" the size field here. usually there is the plaintext size, - # but we use it for the size of the stored object here. - chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size) + """return a chunks index""" + if self.stats: # slow method: build a fresh chunks index, with stored chunk sizes. + logger.info("Getting object IDs present in the repository...") + chunks = ChunkIndex() + for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT): + # we add this id to the chunks index (as unused chunk), because + # we do not know yet whether it is actually referenced from some archives. + # we "abuse" the size field here. usually there is the plaintext size, + # but we use it for the size of the stored object here. + chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size) + else: # faster: rely on existing chunks index (with flags F_NONE and size 0). 
+ logger.info("Getting object IDs from cached chunks index...") + chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True) return chunks def save_chunk_index(self): - # write_chunkindex_to_repo now removes all flags and size infos. - # we need this, as we put the wrong size in there. - write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True) + if self.stats: + # write_chunkindex_to_repo now removes all flags and size infos. + # we need this, as we put the wrong size in there to support --stats computations. + write_chunkindex_to_repo_cache( + self.repository, self.chunks, clear=True, force_write=True, delete_other=True + ) + else: + self.chunks.clear() # we already have updated the repo cache in get_repository_chunks self.chunks = None # nothing there (cleared!) def analyze_archives(self) -> Tuple[Set, Set, int, int, int]: @@ -153,15 +163,18 @@ def report_and_delete(self): logger.info( f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files." 
) - logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.") - logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.") + if self.stats: + logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.") + logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.") + else: + logger.info(f"Repository has data stored in {count} objects.") class CompactMixIn: @with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,)) def do_compact(self, args, repository, manifest): """Collect garbage in repository""" - ArchiveGarbageCollector(repository, manifest).garbage_collect() + ArchiveGarbageCollector(repository, manifest, stats=args.stats).garbage_collect() def build_parser_compact(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog @@ -198,6 +211,16 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser): might not want to do that unless there are signs of lost archives (e.g. when seeing fatal errors when creating backups or when archives are missing in ``borg repo-list``). + + When giving the ``--stats`` option, borg will internally list all repository + objects to determine their existence AND stored size. It will build a fresh + chunks index from that information and cache it in the repository. For some + types of repositories, this might be very slow. It will tell you the sum of + stored object sizes, before and after compaction. + + Without ``--stats``, borg will rely on the cached chunks index to determine + existing object IDs (but there is no stored size information in the index, + thus it can't compute before/after compaction size statistics). 
""" ) subparser = subparsers.add_parser( @@ -210,3 +233,7 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser): help="compact repository", ) subparser.set_defaults(func=self.do_compact) + + subparser.add_argument( + "-s", "--stats", dest="stats", action="store_true", help="print statistics (might be much slower)" + ) diff --git a/src/borg/testsuite/archiver/compact_cmd_test.py b/src/borg/testsuite/archiver/compact_cmd_test.py index 2edda3bee2..66ae73372b 100644 --- a/src/borg/testsuite/archiver/compact_cmd_test.py +++ b/src/borg/testsuite/archiver/compact_cmd_test.py @@ -1,35 +1,48 @@ +import pytest + from ...constants import * # NOQA from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA -def test_compact_empty_repository(archivers, request): +@pytest.mark.parametrize("stats", (True, False)) +def test_compact_empty_repository(archivers, request, stats): archiver = request.getfixturevalue(archivers) cmd(archiver, "repo-create", RK_ENCRYPTION) - output = cmd(archiver, "compact", "-v", exit_code=0) + args = ("-v", "--stats") if stats else ("-v",) + output = cmd(archiver, "compact", *args, exit_code=0) assert "Starting compaction" in output - assert "Repository size is 0 B in 0 objects." in output + if stats: + assert "Repository size is 0 B in 0 objects." in output + else: + assert "Repository has data stored in 0 objects." 
in output assert "Finished compaction" in output -def test_compact_after_deleting_all_archives(archivers, request): +@pytest.mark.parametrize("stats", (True, False)) +def test_compact_after_deleting_all_archives(archivers, request, stats): archiver = request.getfixturevalue(archivers) cmd(archiver, "repo-create", RK_ENCRYPTION) create_src_archive(archiver, "archive") cmd(archiver, "delete", "-a", "archive", exit_code=0) - output = cmd(archiver, "compact", "-v", exit_code=0) + args = ("-v", "--stats") if stats else ("-v",) + output = cmd(archiver, "compact", *args, exit_code=0) assert "Starting compaction" in output assert "Deleting " in output - assert "Repository size is 0 B in 0 objects." in output + if stats: + assert "Repository size is 0 B in 0 objects." in output + else: + assert "Repository has data stored in 0 objects." in output assert "Finished compaction" in output -def test_compact_after_deleting_some_archives(archivers, request): +@pytest.mark.parametrize("stats", (True, False)) +def test_compact_after_deleting_some_archives(archivers, request, stats): archiver = request.getfixturevalue(archivers) cmd(archiver, "repo-create", RK_ENCRYPTION) @@ -37,8 +50,12 @@ def test_compact_after_deleting_some_archives(archivers, request): create_src_archive(archiver, "archive2") cmd(archiver, "delete", "-a", "archive1", exit_code=0) - output = cmd(archiver, "compact", "-v", exit_code=0) + args = ("-v", "--stats") if stats else ("-v",) + output = cmd(archiver, "compact", *args, exit_code=0) assert "Starting compaction" in output assert "Deleting " in output - assert "Repository size is 0 B in 0 objects." not in output + if stats: + assert "Repository size is 0 B in 0 objects." not in output + else: + assert "Repository has data stored in 0 objects." not in output assert "Finished compaction" in output