Skip to content

Commit

Permalink
Prep the terrain for reports with label compression. (#188)
Browse files Browse the repository at this point in the history
context: codecov/engineering-team #768

Creates a rollout for label compression. This will help us test and safely release the feature in the wild.
Notice that currently the label compression does nothing.
There are comments similar to `TODO: needs shared update`.
The update in question is codecov/shared#79

So these changes mostly prep the terrain for future changes that will actually do something,
and add the guardrails to avoid issues when deploying.
In particular it adds some helper methods to the `ArchiveService`, creates a partially stubbed `LabelsIndexService`,
passes more data to the `_adjust_sessions` portion of `raw_upload_processor` where most changes will occur.

If you're curious as to what the end result will probably look like see #180
  • Loading branch information
giovanni-guidini authored Nov 29, 2023
1 parent 8e60c86 commit f751090
Show file tree
Hide file tree
Showing 9 changed files with 368 additions and 8 deletions.
2 changes: 1 addition & 1 deletion database/models/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class Upload(CodecovBaseModel, MixinBaseClass):
name = Column(types.String(100))
provider = Column(types.String(50))
report_id = Column(types.BigInteger, ForeignKey("reports_commitreport.id"))
report = relationship(
report: CommitReport = relationship(
"CommitReport", foreign_keys=[report_id], back_populates="uploads"
)
state = Column(types.String(100), nullable=False)
Expand Down
8 changes: 8 additions & 0 deletions rollouts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,11 @@ def repo_slug(repo: Repository) -> str:
"gitlab/codecov": "enabled",
},
)

# Eventually we want all repos to use this
# This flag will just help us with the rollout process
USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG = Feature(
    "use_label_index_in_report_processing",
    # 0.0 = disabled for everyone by default; only the explicit override
    # below receives the feature while it is being tested.
    0.0,
    overrides={"github/giovanni-guidini/sentry": "enabled"},
)
36 changes: 35 additions & 1 deletion services/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
from datetime import datetime
from enum import Enum
from hashlib import md5
from typing import Any
from typing import Dict
from uuid import uuid4

from shared.config import get_config
from shared.storage.base import BaseStorageService
from shared.storage.exceptions import FileNotInStorageError
from shared.utils.ReportEncoder import ReportEncoder

from helpers.metrics import metrics
Expand All @@ -19,6 +20,9 @@

class MinioEndpoints(Enum):
chunks = "{version}/repos/{repo_hash}/commits/{commitid}/{chunks_file_name}.txt"
label_index = (
"{version}/repos/{repo_hash}/commits/{commitid}/{label_index_file_name}.json"
)
json_data = "{version}/repos/{repo_hash}/commits/{commitid}/json_data/{table}/{field}/{external_id}.json"
json_data_no_commit = (
"{version}/repos/{repo_hash}/json_data/{table}/{field}/{external_id}.json"
Expand Down Expand Up @@ -226,6 +230,20 @@ def write_json_data_to_storage(
self.write_file(path, stringified_data)
return path

def write_label_index(self, commit_sha, json_data, report_code=None) -> str:
    """Serialize ``json_data`` to JSON and store it as the label index
    for ``commit_sha``.

    When ``report_code`` is given it is used as a filename prefix so
    multiple reports per commit keep separate indexes.

    Returns the storage path that was written.
    """
    prefix = "" if report_code is None else report_code + "_"
    path = MinioEndpoints.label_index.get_path(
        version="v4",
        repo_hash=self.storage_hash,
        commitid=commit_sha,
        label_index_file_name=prefix + "labels_index",
    )
    self.write_file(path, json.dumps(json_data))
    return path

"""
Convenience method to write a chunks.txt file to storage.
"""
Expand Down Expand Up @@ -286,6 +304,22 @@ def read_chunks(self, commit_sha, report_code=None) -> str:

return self.read_file(path).decode(errors="replace")

def read_label_index(self, commit_sha, report_code=None) -> Dict[str, str]:
    """Load the label index for ``commit_sha`` from storage.

    When ``report_code`` is given it is used as a filename prefix,
    mirroring ``write_label_index``.

    Returns an empty dict when no index file exists in storage yet.
    """
    prefix = "" if report_code is None else report_code + "_"
    path = MinioEndpoints.label_index.get_path(
        version="v4",
        repo_hash=self.storage_hash,
        commitid=commit_sha,
        label_index_file_name=prefix + "labels_index",
    )
    try:
        raw_contents = self.read_file(path).decode(errors="replace")
    except FileNotInStorageError:
        return dict()
    return json.loads(raw_contents)

"""
Delete a chunk file from the archive
"""
Expand Down
7 changes: 6 additions & 1 deletion services/report/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,12 @@ def build_report_from_raw_content(
try:
with metrics.timer(f"{self.metrics_prefix}.process_report") as t:
result = process_raw_upload(
self.current_yaml, master, raw_uploaded_report, flags, session
self.current_yaml,
master,
raw_uploaded_report,
flags,
session,
upload=upload,
)
report = result.report
log.info(
Expand Down
46 changes: 46 additions & 0 deletions services/report/labels_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import Dict

from shared.reports.resources import Report

from database.models.reports import CommitReport
from services.archive import ArchiveService


class LabelsIndexService(object):
    """Loads and persists a commit report's label index via the archive.

    NOTE(review): several operations are intentionally stubbed until the
    shared-library update (codecov/shared#79) lands — see the TODO
    comments below.
    """

    # Archive client used to read/write the label index file for the commit.
    _archive_client: ArchiveService

    def __init__(self, commit_report: CommitReport) -> None:
        self.commit_report = commit_report
        self._archive_client = ArchiveService(
            repository=commit_report.commit.repository
        )
        self.commit_sha = commit_report.commit.commitid

    def set_label_idx(self, report: Report) -> Dict[int, str]:
        """Load the label index for this commit from storage.

        Currently returns the loaded index instead of attaching it to
        ``report`` — attaching requires the pending shared update.
        """
        # TODO: Needs shared update.
        # if report._labels_index is not None:
        #     raise Exception(
        #         "Trying to set labels_index of Report, but it's already set"
        #     )
        # Load label index from storage
        # JSON uses strings as keys, but we are using ints.
        map_with_str_keys = self._archive_client.read_label_index(
            self.commit_sha, self.commit_report.code
        )
        loaded_index = {int(k): v for k, v in map_with_str_keys.items()}
        return loaded_index
        # TODO: Needs shared update.
        # report.set_label_idx(loaded_index)

    def unset_label_idx(self, report: Report, label_index: Dict[str, str]):
        """Persist ``label_index`` to storage for this commit.

        NOTE(review): in practice the index produced by ``set_label_idx``
        has int keys; JSON serialization stringifies them either way —
        confirm the intended key type once the shared update lands.
        """
        # Write the updated index back into storage
        # TODO: Needs shared update
        # self._archive_client.write_label_index(
        #     self.commit_sha, report._labels_index, self.commit_report.code
        # )
        self._archive_client.write_label_index(
            self.commit_sha, label_index, self.commit_report.code
        )
        # Remove reference to it
        # TODO: Needs shared update
        # report.unset_label_idx()
47 changes: 43 additions & 4 deletions services/report/raw_upload_processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

import json
import logging
import random
import typing
Expand All @@ -9,10 +10,12 @@
from shared.reports.resources import Report
from shared.utils.sessions import Session, SessionType

from database.models.reports import Upload
from helpers.exceptions import ReportEmptyError
from helpers.labels import get_all_report_labels, get_labels_per_session
from rollouts import USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG, repo_slug
from services.path_fixer import PathFixer
from services.report.fixes import get_fixes_from_raw
from services.report.labels_index import LabelsIndexService
from services.report.parser.types import ParsedRawReport
from services.report.report_builder import ReportBuilder, SpecialLabelsEnum
from services.report.report_processor import process_report
Expand Down Expand Up @@ -48,7 +51,12 @@ class UploadProcessingResult(object):

@sentry_sdk.trace
def process_raw_upload(
commit_yaml, original_report, reports: ParsedRawReport, flags, session=None
commit_yaml,
original_report,
reports: ParsedRawReport,
flags,
session=None,
upload: Upload = None,
) -> UploadProcessingResult:
toc, env = None, None

Expand Down Expand Up @@ -129,7 +137,11 @@ def process_raw_upload(
if not temporary_report:
raise ReportEmptyError("No files found in report.")
session_manipulation_result = _adjust_sessions(
original_report, temporary_report, session, commit_yaml
original_report,
temporary_report,
to_merge_session=session,
current_yaml=commit_yaml,
upload=upload,
)
original_report.merge(temporary_report, joined=joined)
session.totals = temporary_report.totals
Expand All @@ -147,7 +159,16 @@ class SessionAdjustmentResult(object):
partially_deleted_sessions: set


def _adjust_sessions(original_report, to_merge_report, to_merge_session, current_yaml):
# RUSTIFYME
@sentry_sdk.trace
def _adjust_sessions(
original_report: Report,
to_merge_report: Report,
to_merge_session,
current_yaml,
*,
upload: Upload = None,
):
session_ids_to_fully_delete = []
session_ids_to_partially_delete = []
to_merge_flags = to_merge_session.flags or []
Expand All @@ -164,6 +185,24 @@ def _adjust_sessions(original_report, to_merge_report, to_merge_session, current
for f in flags_under_carryforward_rules
if f not in to_partially_overwrite_flags
]
if upload is None and to_partially_overwrite_flags:
log.warning("Upload is None, but there are partial_overwrite_flags present")
if (
upload
and USE_LABEL_INDEX_IN_REPORT_PROCESSING_BY_REPO_SLUG.check_value(
repo_slug(upload.report.commit.repository), default=False
)
and to_partially_overwrite_flags
):
label_index_service = LabelsIndexService(upload.report)
# TODO: Needs shared upload
# if original_report._labels_index is None:
# label_index_service.set_label_idx(original_report)
# # Make sure that the labels in the reports are in a good state to merge them
# make_sure_orginal_report_is_using_label_ids(original_report)
# make_sure_label_indexes_match(original_report, to_merge_report)
# # After this point we don't need the label index anymore, so we can release it to save memory
# label_index_service.unset_label_idx(original_report)
if to_fully_overwrite_flags or to_partially_overwrite_flags:
for sess_id, curr_sess in original_report.sessions.items():
if curr_sess.session_type == SessionType.carriedforward:
Expand Down
85 changes: 85 additions & 0 deletions services/report/tests/unit/test_labels_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pytest
from shared.reports.resources import Report

from database.tests.factories.core import ReportFactory
from helpers.labels import SpecialLabelsEnum
from services.report.labels_index import ArchiveService, LabelsIndexService


class TestLabelsIndex(object):
    def test_init(self, dbsession, mocker):
        """The service wires up an archive client and records the commit SHA."""
        commit_report = ReportFactory()
        dbsession.add(commit_report)
        dbsession.flush()

        service = LabelsIndexService(commit_report)
        assert service._archive_client is not None
        assert service.commit_sha == commit_report.commit.commitid

    def test_set_label_idx(self, dbsession, mocker):
        """Loading the index converts the stored string keys into ints."""
        commit_report = ReportFactory()
        dbsession.add(commit_report)
        dbsession.flush()
        # The stored index comes back from a JSON file, and JSON objects can
        # only have string keys — hence the str keys here.
        stored_index = {
            "0": SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
            "1": "some_label",
            "2": "another_label",
        }
        mocker.patch.object(
            ArchiveService, "read_label_index", return_value=stored_index
        )
        report = Report()
        # TODO: Needs shared update
        # assert report._labels_index == None
        service = LabelsIndexService(commit_report)
        loaded_index = service.set_label_idx(report)
        expected_index = {
            0: SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
            1: "some_label",
            2: "another_label",
        }
        assert loaded_index == expected_index

    # TODO: Needs shared update
    # def test_set_label_idx_already_set(self, dbsession, mocker):
    #     commit_report = ReportFactory()
    #     dbsession.add(commit_report)
    #     dbsession.flush()
    #     mock_read = mocker.patch.object(ArchiveService, "read_label_index")
    #     sample_label_index = {
    #         0: SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
    #         1: "some_label",
    #         2: "another_label",
    #     }
    #     report = Report()
    #     report._labels_index = sample_label_index
    #     with pytest.raises(Exception) as exp:
    #         label_service = LabelsIndexService(commit_report)
    #         label_service.set_label_idx(report)
    #     mock_read.assert_not_called()
    #     assert (
    #         str(exp.value)
    #         == "Trying to set labels_index of Report, but it's already set"
    #     )

    def test_unset_label_idx(self, dbsession, mocker):
        """Unsetting the index writes it back to storage for the commit."""
        commit_report = ReportFactory()
        dbsession.add(commit_report)
        dbsession.flush()
        label_index = {
            0: SpecialLabelsEnum.CODECOV_ALL_LABELS_PLACEHOLDER.corresponding_label,
            1: "some_label",
            2: "another_label",
        }
        mock_write = mocker.patch.object(ArchiveService, "write_label_index")
        report = Report()
        # TODO: Needs shared update
        # report._labels_index = label_index
        service = LabelsIndexService(commit_report)
        service.unset_label_idx(report, label_index)
        # assert report._labels_index == None
        mock_write.assert_called_with(
            commit_report.commit.commitid, label_index, commit_report.code
        )
Loading

0 comments on commit f751090

Please sign in to comment.