From f751090743dbf9f0830739cd62249da7649cac69 Mon Sep 17 00:00:00 2001 From: Giovanni M Guidini <99758426+giovanni-guidini@users.noreply.github.com> Date: Wed, 29 Nov 2023 09:28:10 -0300 Subject: [PATCH] Prep the terrain for reports with label compression. (#188) context: codecov/engineering-team #768 Creates a rollout for label compression. This is gonna help us to test and safely release the feature in the wild. Notice that currently the label compression does nothing. There are comments similar to `TODO: needs shared update`. The update in question is https://github.com/codecov/shared/pull/79 So these changes mostly prep the terrain for future changes that will actually do something, and add the guardrails to avoid issues when deploying. In particular it adds some helper methods to the `ArchiveService`, creates a kinda-stubbed `LabelsIndexService`, passes more data to the `_adjust_sessions` portion of `raw_upload_processor` where most changes will occur. If you're curious as to what the end result will probably look like see https://github.com/codecov/worker/pull/180 --- database/models/reports.py | 2 +- rollouts/__init__.py | 8 ++ services/archive.py | 36 +++++++- services/report/__init__.py | 7 +- services/report/labels_index.py | 46 ++++++++++ services/report/raw_upload_processor.py | 47 +++++++++- .../report/tests/unit/test_labels_index.py | 85 +++++++++++++++++++ services/report/tests/unit/test_sessions.py | 66 +++++++++++++- services/tests/unit/test_archive_service.py | 79 +++++++++++++++++ 9 files changed, 368 insertions(+), 8 deletions(-) create mode 100644 services/report/labels_index.py create mode 100644 services/report/tests/unit/test_labels_index.py diff --git a/database/models/reports.py b/database/models/reports.py index 249581e49..9a899a9e1 100644 --- a/database/models/reports.py +++ b/database/models/reports.py @@ -84,7 +84,7 @@ class Upload(CodecovBaseModel, MixinBaseClass): name = Column(types.String(100)) provider = Column(types.String(50)) report_id = Column(types.BigInteger, ForeignKey("reports_commitreport.id")) - report = relationship( + report: CommitReport = relationship( "CommitReport", foreign_keys=[report_id], back_populates="uploads" ) state = Column(types.String(100), nullable=False) diff --git a/rollouts/__init__.py b/rollouts/__init__.py index d8db69c21..3ad0271f5 100644 --- a/rollouts/__init__.py +++ b/rollouts/__init__.py @@ -22,3 +22,11 @@ def repo_slug(repo: Repository) -> str: "gitlab/codecov": "enabled", }, ) + +# Eventually we want all repos to use this +# This flag will just help us with the rollout process +USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG = Feature( + "use_label_index_in_report_processing", + 0.0, + overrides={"github/giovanni-guidini/sentry": "enabled"}, +) diff --git a/services/archive.py b/services/archive.py index e85d94d69..023cb297a 100644 --- a/services/archive.py +++ b/services/archive.py @@ -4,11 +4,12 @@ from datetime import datetime from enum import Enum from hashlib import md5 -from typing import Any +from typing import Dict from uuid import uuid4 from shared.config import get_config from shared.storage.base import BaseStorageService +from shared.storage.exceptions import FileNotInStorageError from shared.utils.ReportEncoder import ReportEncoder from helpers.metrics import metrics @@ -19,6 +20,9 @@ class MinioEndpoints(Enum): chunks = "{version}/repos/{repo_hash}/commits/{commitid}/{chunks_file_name}.txt" + label_index = ( + "{version}/repos/{repo_hash}/commits/{commitid}/{label_index_file_name}.json" + ) json_data = "{version}/repos/{repo_hash}/commits/{commitid}/json_data/{table}/{field}/{external_id}.json" json_data_no_commit = ( "{version}/repos/{repo_hash}/json_data/{table}/{field}/{external_id}.json" @@ -226,6 +230,20 @@ def write_json_data_to_storage( self.write_file(path, stringified_data) return path + def write_label_index(self, commit_sha, json_data, report_code=None) -> str: + label_index_file_name = ( + report_code + "_" if report_code is not None else "" + ) + "labels_index" + path = MinioEndpoints.label_index.get_path( + version="v4", + repo_hash=self.storage_hash, + commitid=commit_sha, + label_index_file_name=label_index_file_name, + ) + string_data = json.dumps(json_data) + self.write_file(path, string_data) + return path + """ Convenience method to write a chunks.txt file to storage. """ @@ -286,6 +304,22 @@ def read_chunks(self, commit_sha, report_code=None) -> str: return self.read_file(path).decode(errors="replace") + def read_label_index(self, commit_sha, report_code=None) -> Dict[str, str]: + label_index_file_name = ( + report_code + "_" if report_code is not None else "" + ) + "labels_index" + path = MinioEndpoints.label_index.get_path( + version="v4", + repo_hash=self.storage_hash, + commitid=commit_sha, + label_index_file_name=label_index_file_name, + ) + + try: + return json.loads(self.read_file(path).decode(errors="replace")) + except FileNotInStorageError: + return dict() + """ Delete a chunk file from the archive """ diff --git a/services/report/__init__.py b/services/report/__init__.py index 5bc59a5c4..405baa031 100644 --- a/services/report/__init__.py +++ b/services/report/__init__.py @@ -786,7 +786,12 @@ def build_report_from_raw_content( try: with metrics.timer(f"{self.metrics_prefix}.process_report") as t: result = process_raw_upload( - self.current_yaml, master, raw_uploaded_report, flags, session + self.current_yaml, + master, + raw_uploaded_report, + flags, + session, + upload=upload, ) report = result.report log.info( diff --git a/services/report/labels_index.py b/services/report/labels_index.py new file mode 100644 index 000000000..8140e3f0d --- /dev/null +++ b/services/report/labels_index.py @@ -0,0 +1,46 @@ +from typing import Dict + +from shared.reports.resources import Report + +from database.models.reports import CommitReport +from services.archive import ArchiveService + + +class LabelsIndexService(object): + _archive_client: ArchiveService + + def __init__(self, commit_report: CommitReport) -> None: + self.commit_report = commit_report + self._archive_client = ArchiveService( + repository=commit_report.commit.repository + ) + self.commit_sha = commit_report.commit.commitid + + def set_label_idx(self, report: Report): + # TODO: Needs shared update. + # if report._labels_index is not None: + # raise Exception( + # "Trying to set labels_index of Report, but it's already set" + # ) + # Load label index from storage + # JSON uses strings are keys, but we are using ints. + map_with_str_keys = self._archive_client.read_label_index( + self.commit_sha, self.commit_report.code + ) + loaded_index = {int(k): v for k, v in map_with_str_keys.items()} + return loaded_index + # TODO: Needs shared update. + # report.set_label_idx(loaded_index) + + def unset_label_idx(self, report: Report, label_index: Dict[str, str]): + # Write the updated index back into storage + # TODO: Needs shared update + # self._archive_client.write_label_index( + # self.commit_sha, report._labels_index, self.commit_report.code + # ) + self._archive_client.write_label_index( + self.commit_sha, label_index, self.commit_report.code + ) + # Remove reference to it + # TODO: Needs shared update + # report.unset_label_idx() diff --git a/services/report/raw_upload_processor.py b/services/report/raw_upload_processor.py index 4ac1ccee6..1a81535be 100644 --- a/services/report/raw_upload_processor.py +++ b/services/report/raw_upload_processor.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import json import logging import random import typing @@ -9,10 +10,12 @@ from shared.reports.resources import Report from shared.utils.sessions import Session, SessionType +from database.models.reports import Upload from helpers.exceptions import ReportEmptyError from helpers.labels import get_all_report_labels, get_labels_per_session +from rollouts import USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG, repo_slug from services.path_fixer import PathFixer -from services.report.fixes import get_fixes_from_raw +from services.report.labels_index import LabelsIndexService from services.report.parser.types import ParsedRawReport from services.report.report_builder import ReportBuilder, SpecialLabelsEnum from services.report.report_processor import process_report @@ -48,7 +51,12 @@ class UploadProcessingResult(object): @sentry_sdk.trace def process_raw_upload( - commit_yaml, original_report, reports: ParsedRawReport, flags, session=None + commit_yaml, + original_report, + reports: ParsedRawReport, + flags, + session=None, + upload: Upload = None, ) -> UploadProcessingResult: toc, env = None, None @@ -129,7 +137,11 @@ def process_raw_upload( if not temporary_report: raise ReportEmptyError("No files found in report.") session_manipulation_result = _adjust_sessions( - original_report, temporary_report, session, commit_yaml + original_report, + temporary_report, + to_merge_session=session, + current_yaml=commit_yaml, + upload=upload, ) original_report.merge(temporary_report, joined=joined) session.totals = temporary_report.totals @@ -147,7 +159,16 @@ class SessionAdjustmentResult(object): partially_deleted_sessions: set -def _adjust_sessions(original_report, to_merge_report, to_merge_session, current_yaml): +# RUSTIFYME +@sentry_sdk.trace +def _adjust_sessions( + original_report: Report, + to_merge_report: Report, + to_merge_session, + current_yaml, + *, + upload: Upload = None, +): session_ids_to_fully_delete = [] session_ids_to_partially_delete = [] to_merge_flags = to_merge_session.flags or [] @@ -164,6 +185,24 @@ def _adjust_sessions(original_report, to_merge_report, to_merge_session, current for f in flags_under_carryforward_rules if f not in to_partially_overwrite_flags ] + if upload is None and to_partially_overwrite_flags: + log.warning("Upload is None, but there are partial_overwrite_flags present") + if ( + upload + and USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG.check_value( + repo_slug(upload.report.commit.repository), default=False + ) + and to_partially_overwrite_flags + ): + label_index_service = LabelsIndexService(upload.report) + # TODO: Needs shared upload + # if original_report._labels_index is None: + # label_index_service.set_label_idx(original_report) + # # Make sure that the labels in the reports are in a good state to merge them + # make_sure_orginal_report_is_using_label_ids(original_report) + # make_sure_label_indexes_match(original_report, to_merge_report) + # # After this point we don't need the label index anymore, so we can release it to save memory + # label_index_service.unset_label_idx(original_report) if to_fully_overwrite_flags or to_partially_overwrite_flags: for sess_id, curr_sess in original_report.sessions.items(): if curr_sess.session_type == SessionType.carriedforward: diff --git a/services/report/tests/unit/test_labels_index.py b/services/report/tests/unit/test_labels_index.py new file mode 100644 index 000000000..7bdc3650d --- /dev/null +++ b/services/report/tests/unit/test_labels_index.py @@ -0,0 +1,85 @@ +import pytest +from shared.reports.resources import Report + +from database.tests.factories.core import ReportFactory +from helpers.labels import SpecialLabelsEnum +from services.report.labels_index import ArchiveService, LabelsIndexService + + +class TestLabelsIndex(object): + def test_init(self, dbsession, mocker): + commit_report = ReportFactory() + dbsession.add(commit_report) + dbsession.flush() + + labels_index_service = LabelsIndexService(commit_report) + assert labels_index_service._archive_client is not None + assert labels_index_service.commit_sha == commit_report.commit.commitid + + def test_set_label_idx(self, dbsession, mocker): + commit_report = ReportFactory() + dbsession.add(commit_report) + dbsession.flush() + # Notice that the keys are strings + # because self._archive_client.read_label_index returns the contents of a JSON file, + # and JSON can only have string keys. + sample_label_index = { + "0": SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label, + "1": "some_label", + "2": "another_label", + } + mocker.patch.object( + ArchiveService, "read_label_index", return_value=sample_label_index + ) + report = Report() + # TODO: Needs shared update + # assert report._labels_index == None + label_service = LabelsIndexService(commit_report) + res = label_service.set_label_idx(report) + assert res == { + 0: SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label, + 1: "some_label", + 2: "another_label", + } + + # TODO: Needs shared update + # def test_set_label_idx_already_set(self, dbsession, mocker): + # commit_report = ReportFactory() + # dbsession.add(commit_report) + # dbsession.flush() + # mock_read = mocker.patch.object(ArchiveService, "read_label_index") + # sample_label_index = { + # 0: SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label, + # 1: "some_label", + # 2: "another_label", + # } + # report = Report() + # report._labels_index = sample_label_index + # with pytest.raises(Exception) as exp: + # label_service = LabelsIndexService(commit_report) + # label_service.set_label_idx(report) + # mock_read.assert_not_called() + # assert ( + # str(exp.value) + # == "Trying to set labels_index of Report, but it's already set" + # ) + + def test_unset_label_idx(self, dbsession, mocker): + commit_report = ReportFactory() + dbsession.add(commit_report) + dbsession.flush() + sample_label_index = { + 0: SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label, + 1: "some_label", + 2: "another_label", + } + mock_write = mocker.patch.object(ArchiveService, "write_label_index") + report = Report() + # TODO: Needs shared update + # report._labels_index = sample_label_index + label_service = LabelsIndexService(commit_report) + label_service.unset_label_idx(report, sample_label_index) + # assert report._labels_index == None + mock_write.assert_called_with( + commit_report.commit.commitid, sample_label_index, commit_report.code + ) diff --git a/services/report/tests/unit/test_sessions.py b/services/report/tests/unit/test_sessions.py index 4f4529c2f..f7179fbbd 100644 --- a/services/report/tests/unit/test_sessions.py +++ b/services/report/tests/unit/test_sessions.py @@ -1,4 +1,5 @@ import pytest +from mock import MagicMock from shared.reports.editable import EditableReport, EditableReportFile from shared.reports.resources import ( LineSession, @@ -11,6 +12,7 @@ from shared.reports.types import CoverageDatapoint from shared.yaml import UserYaml +from database.tests.factories.core import RepositoryFactory from helpers.labels import SpecialLabelsEnum from services.report.raw_upload_processor import ( SessionAdjustmentResult, @@ -415,7 +417,9 @@ def test_adjust_sessions_full_cf_only(self, sample_first_report): }, } - def test_adjust_sessions_partial_cf_only_no_changes(self, sample_first_report): + def test_adjust_sessions_partial_cf_only_no_changes( + self, sample_first_report, mocker + ): first_to_merge_session = Session(flags=["enterprise"], id=3) second_report = Report( sessions={first_to_merge_session.id: first_to_merge_session} @@ -433,12 +437,72 @@ def test_adjust_sessions_partial_cf_only_no_changes(self, sample_first_report): } } ) + mock_label_index_service = mocker.patch( + "services.report.raw_upload_processor.LabelsIndexService" + ) first_value = self.convert_report_to_better_readable(sample_first_report) assert _adjust_sessions( sample_first_report, second_report, first_to_merge_session, current_yaml ) == SessionAdjustmentResult([], [0]) after_result = self.convert_report_to_better_readable(sample_first_report) assert after_result == first_value + mock_label_index_service.assert_not_called() + + def test_adjust_sessions_partial_cf_only_no_changes_encoding_labels( + self, sample_first_report, mocker + ): + first_to_merge_session = Session(flags=["enterprise"], id=3) + second_report = Report( + sessions={first_to_merge_session.id: first_to_merge_session} + ) + current_yaml = UserYaml( + { + "flag_management": { + "individual_flags": [ + { + "name": "enterprise", + "carryforward_mode": "labels", + "carryforward": True, + } + ] + } + } + ) + first_value = self.convert_report_to_better_readable(sample_first_report) + upload = MagicMock( + name="fake_upload", + **{ + "report": MagicMock( + name="fake_commit_report", + **{ + "code": None, + "commit": MagicMock( + name="fake_commit", + **{ + "repository": RepositoryFactory( + name="sentry", + owner__username="giovanni-guidini", + owner__service="github", + ) + } + ), + } + ) + } + ) + mock_label_index_service = mocker.patch( + "services.report.raw_upload_processor.LabelsIndexService" + ) + assert _adjust_sessions( + sample_first_report, + second_report, + first_to_merge_session, + current_yaml, + upload=upload, + ) == SessionAdjustmentResult([], [0]) + after_result = self.convert_report_to_better_readable(sample_first_report) + assert after_result == first_value + assert mock_label_index_service.call_count == 1 def test_adjust_sessions_partial_cf_only_some_changes(self, sample_first_report): first_to_merge_session = Session(flags=["enterprise"], id=3) diff --git a/services/tests/unit/test_archive_service.py b/services/tests/unit/test_archive_service.py index 0d34c7645..8cff7d25a 100644 --- a/services/tests/unit/test_archive_service.py +++ b/services/tests/unit/test_archive_service.py @@ -1,8 +1,10 @@ import json from shared.storage import MinioStorageService +from shared.storage.exceptions import FileNotInStorageError from database.tests.factories import RepositoryFactory +from database.tests.factories.core import CommitFactory from services.archive import ArchiveService from test_utils.base import BaseTestCase @@ -136,3 +138,80 @@ def test_write_report_details_to_storage_no_commitid(self, mocker, dbsession): is_already_gzipped=False, reduced_redundancy=False, ) + + +class TestLabelIndex(object): + def test_write_label_index_to_storage(self, mocker, dbsession): + commit = CommitFactory() + dbsession.add(commit) + dbsession.flush() + mock_write_file = mocker.patch.object(MinioStorageService, "write_file") + archive_service = ArchiveService(repository=commit.repository) + data = {1: "some_label", 2: "another_label", 3: "yet_another_label"} + path_for_default_code = archive_service.write_label_index( + commit.commitid, data, report_code=None + ) + assert ( + path_for_default_code + == f"v4/repos/{archive_service.storage_hash}/commits/{commit.commitid}/labels_index.json" + ) + mock_write_file.assert_called_with( + archive_service.root, + path_for_default_code, + json.dumps(data), + is_already_gzipped=False, + reduced_redundancy=False, + ) + + path_for_different_code = archive_service.write_label_index( + commit.commitid, data, report_code="local" + ) + assert ( + path_for_different_code + == f"v4/repos/{archive_service.storage_hash}/commits/{commit.commitid}/local_labels_index.json" + ) + mock_write_file.assert_called_with( + archive_service.root, + path_for_different_code, + json.dumps(data), + is_already_gzipped=False, + reduced_redundancy=False, + ) + + def test_read_label_index_from_storage(self, mocker, dbsession): + commit = CommitFactory() + dbsession.add(commit) + dbsession.flush() + # Notice that the keys are string. JSON uses strings as keys. + # It's not the responsibility of read_label_index to fix this detail + data = {"1": "some_label", "2": "another_label", "3": "yet_another_label"} + mock_read_file = mocker.patch.object( + ArchiveService, "read_file", return_value=json.dumps(data).encode() + ) + archive_service = ArchiveService(repository=commit.repository) + + assert archive_service.read_label_index(commit.commitid) == data + mock_read_file.assert_called_with( + f"v4/repos/{archive_service.storage_hash}/commits/{commit.commitid}/labels_index.json" + ) + + assert ( + archive_service.read_label_index(commit.commitid, report_code="local") + == data + ) + mock_read_file.assert_called_with( + f"v4/repos/{archive_service.storage_hash}/commits/{commit.commitid}/local_labels_index.json" + ) + + def test_read_label_index_from_storage_file_not_found(self, mocker, dbsession): + commit = CommitFactory() + dbsession.add(commit) + dbsession.flush() + mock_read_file = mocker.patch.object( + ArchiveService, "read_file", side_effect=FileNotInStorageError + ) + archive_service = ArchiveService(repository=commit.repository) + assert archive_service.read_label_index(commit.commitid) == {} + mock_read_file.assert_called_with( + f"v4/repos/{archive_service.storage_hash}/commits/{commit.commitid}/labels_index.json" + )