Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/develop' into feature/preprint…
Browse files Browse the repository at this point in the history
…s-doi-versioning
  • Loading branch information
cslzchen committed Dec 18, 2024
2 parents 4b8894e + 8643e3f commit 2706450
Show file tree
Hide file tree
Showing 25 changed files with 620 additions and 32 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@

We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO.

24.11.0 (2024-12-11)
====================
- Institutional Dashboard Project Bugfix Release

24.10.0 (2024-12-05)
====================

- Migrate Preprint Affiliations
- Add OOPSpam and Akismet metrics to spam report
- Add PrivateSpamMetricsReport
- Update PrivateSpamMetricsReporter to work with refactored MonthlyReporter
- Fix duplicate reports when run for past years
- Fix counted-usage clobbers

24.09.0 (2024-11-14)
====================

Expand Down
2 changes: 2 additions & 0 deletions api/institutions/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ class Meta:
})

id = IDField(source='meta.id', read_only=True)
report_yearmonth = YearmonthField(read_only=True)
user_name = ser.CharField(read_only=True)
department = ser.CharField(read_only=True, source='department_name')
orcid_id = ser.CharField(read_only=True)
Expand Down Expand Up @@ -372,6 +373,7 @@ class Meta:

id = IDField(read_only=True)

report_yearmonth = YearmonthField(read_only=True)
user_count = ser.IntegerField(read_only=True)
public_project_count = ser.IntegerField(read_only=True)
private_project_count = ser.IntegerField(read_only=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ def test_get_report(self, app, url, institutional_admin, institution, reports, u
assert data['type'] == 'institution-summary-metrics'

attributes = data['attributes']
assert attributes['report_yearmonth'] == '2024-08'
assert attributes['user_count'] == 200
assert attributes['public_project_count'] == 150
assert attributes['private_project_count'] == 125
Expand Down Expand Up @@ -254,6 +255,7 @@ def test_get_report_with_multiple_months_and_institutions(

attributes = data['attributes']

assert attributes['report_yearmonth'] == '2024-09'
assert attributes['user_count'] == 250
assert attributes['public_project_count'] == 200
assert attributes['private_project_count'] == 150
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu
response_body = resp.text
expected_response = [
[
'report_yearmonth',
'account_creation_date',
'department',
'embargoed_registration_count',
Expand All @@ -460,6 +461,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu
'user_name'
],
[
'2024-08',
'2018-02',
'Center, \t Greatest Ever',
'1',
Expand Down Expand Up @@ -512,6 +514,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu
month_last_login='2018-02',
)
expected_data.append([
'2024-08',
'2018-02',
'QBatman',
'1',
Expand Down Expand Up @@ -552,6 +555,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu
response_rows = list(reader)
# Validate header row
expected_header = [
'report_yearmonth',
'account_creation_date',
'department',
'embargoed_registration_count',
Expand Down Expand Up @@ -606,6 +610,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti
response_data = json.loads(resp.body)
expected_data = [
{
'report_yearmonth': '2024-08',
'account_creation_date': '2018-02',
'department': 'Safety "The Wolverine" Weapon X',
'embargoed_registration_count': 1,
Expand Down
16 changes: 8 additions & 8 deletions api_tests/metrics/test_counted_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ def test_by_client_session_id(self, app, mock_save, user):
assert resp.status_code == 201
assert_saved_with(
mock_save,
# doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest()
expected_doc_id='55fffffdc0d674d15a5e8763d14e4ae90f658fbfb6fbf94f88a5d24978f02e72',
# doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest()
expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2',
expected_attrs={
'platform_iri': 'http://example.foo/',
'item_guid': 'zyxwv',
Expand Down Expand Up @@ -132,8 +132,8 @@ def test_by_client_session_id_anon(self, app, mock_save):
assert resp.status_code == 201
assert_saved_with(
mock_save,
# doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest()
expected_doc_id='e559ffbc4bd3e3e69252d34c273f0e771ec89ee455ec9b60fbbadf3944e4af4e',
# doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest()
expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82',
expected_attrs={
'platform_iri': 'http://example.foo/',
'item_guid': 'zyxwv',
Expand Down Expand Up @@ -166,8 +166,8 @@ def test_by_user_auth(self, app, mock_save, user):
assert resp.status_code == 201
assert_saved_with(
mock_save,
# doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3').hexdigest()
expected_doc_id='743494d8a55079b91e202da1dbdfce5aea72e310c57a34b36df2c2af5ed4d362',
# doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest()
expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2',
expected_attrs={
'platform_iri': 'http://example.foo/',
'item_guid': 'yxwvu',
Expand Down Expand Up @@ -196,8 +196,8 @@ def test_by_useragent_header(self, app, mock_save):
assert resp.status_code == 201
assert_saved_with(
mock_save,
# doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3').hexdigest()
expected_doc_id='a50ac1b2dc1c918cdea7be50b005117fdb6ee00ea069ca3aa4aaf03c0f905fa0',
# doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest()
expected_doc_id='d669528b30f443ffe506e183537af9624ef290090e90a200ecce7b7ca19c77f7',
expected_attrs={
'platform_iri': 'http://example.foo/',
'item_guid': 'yxwvu',
Expand Down
34 changes: 34 additions & 0 deletions osf/external/askismet/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,37 @@ def submit_ham(self, user_ip, user_agent, **kwargs):
)
if res.status_code != requests.codes.ok:
raise AkismetClientError(reason=res.text)

def get_flagged_count(self, start_date, end_date, category='node'):
from osf.models import NodeLog, PreprintLog

if category not in ['node', 'preprint']:
raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")

log_model = NodeLog if category == 'node' else PreprintLog

flagged_count = log_model.objects.filter(
action=log_model.FLAG_SPAM,
created__gt=start_date,
created__lt=end_date,
**{f'{category}__spam_data__who_flagged__in': ['akismet', 'both']}
).count()

return flagged_count

def get_hammed_count(self, start_date, end_date, category='node'):
from osf.models import NodeLog, PreprintLog

if category not in ['node', 'preprint']:
raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")

log_model = NodeLog if category == 'node' else PreprintLog

hammed_count = log_model.objects.filter(
action=log_model.CONFIRM_HAM,
created__gt=start_date,
created__lt=end_date,
**{f'{category}__spam_data__who_flagged__in': ['akismet', 'both']}
).count()

return hammed_count
34 changes: 34 additions & 0 deletions osf/external/oopspam/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,37 @@ def check_content(self, user_ip, content, **kwargs):

# OOPSpam returns a spam score out of 6. 3 or higher indicates spam
return spam_score >= settings.OOPSPAM_SPAM_LEVEL, resp_json

def get_flagged_count(self, start_date, end_date, category='node'):
from osf.models import NodeLog, PreprintLog

if category not in ['node', 'preprint']:
raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")

log_model = NodeLog if category == 'node' else PreprintLog

flagged_count = log_model.objects.filter(
action=log_model.FLAG_SPAM,
created__gt=start_date,
created__lt=end_date,
**{f'{category}__spam_data__who_flagged__in': ['oopspam', 'both']}
).count()

return flagged_count

def get_hammed_count(self, start_date, end_date, category='node'):
from osf.models import NodeLog, PreprintLog

if category not in ['node', 'preprint']:
raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")

log_model = NodeLog if category == 'node' else PreprintLog

hammed_count = log_model.objects.filter(
action=log_model.CONFIRM_HAM,
created__gt=start_date,
created__lt=end_date,
**{f'{category}__spam_data__who_flagged__in': ['oopspam', 'both']}
).count()

return hammed_count
118 changes: 118 additions & 0 deletions osf/management/commands/migrate_preprint_affiliation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import datetime
import logging

from django.core.management.base import BaseCommand
from django.db import transaction
from django.db.models import F, Exists, OuterRef

from osf.models import PreprintContributor, InstitutionAffiliation

logger = logging.getLogger(__name__)

AFFILIATION_TARGET_DATE = datetime.datetime(2024, 9, 19, 14, 37, 48, tzinfo=datetime.timezone.utc)


class Command(BaseCommand):
    """Management command: copy user institution affiliations onto preprints.

    Delegates the actual work to ``assign_affiliations_to_preprints``; this
    class only parses CLI options and logs timing around the run. Supports a
    dry-run mode and exclusion of specific users by GUID.
    """

    help = 'Assign affiliations from users to preprints where they have write or admin permissions.'

    def add_arguments(self, parser):
        # Users whose affiliations should NOT be copied over.
        parser.add_argument(
            '--exclude-guids',
            nargs='+',
            dest='exclude_guids',
            help='List of user GUIDs to exclude from affiliation assignment',
        )
        # Log what would happen without writing anything.
        parser.add_argument(
            '--dry-run',
            action='store_true',
            dest='dry_run',
            help='If true, performs a dry run without making changes',
        )
        # Tune how many contributor rows are handled per batch.
        parser.add_argument(
            '--batch-size',
            type=int,
            default=1000,
            dest='batch_size',
            help='Number of contributors to process in each batch',
        )

    def handle(self, *args, **options):
        # Record wall-clock times so the summary can report total duration.
        started = datetime.datetime.now()
        logger.info(f'Script started at: {started}')

        dry_run = options.get('dry_run', False)
        if dry_run:
            logger.info('Dry run mode activated.')

        processed_count, updated_count = assign_affiliations_to_preprints(
            exclude_guids=set(options.get('exclude_guids') or []),
            dry_run=dry_run,
            batch_size=options.get('batch_size', 1000),
        )

        finished = datetime.datetime.now()
        logger.info(f'Script finished at: {finished}')
        logger.info(f'Total processed: {processed_count}, Updated: {updated_count}')
        logger.info(f'Total run time: {finished - started}')


def assign_affiliations_to_preprints(exclude_guids=None, dry_run=True, batch_size=1000):
    """Copy contributors' institution affiliations onto their preprints.

    Iterates, in batches, over preprint contributors who hold the
    'write_preprint' permission and have at least one institution affiliation,
    and adds each such user's affiliated institutions to the preprint.
    Preprints created after AFFILIATION_TARGET_DATE are skipped.

    :param exclude_guids: iterable of user GUIDs to skip (falsy → empty set)
    :param dry_run: when True, only log what would be done; no writes occur
    :param batch_size: number of contributor rows fetched per batch
    :return: tuple ``(processed_count, updated_count)`` — contributors
        examined past the date cutoff, and contributors actually written
        (``updated_count`` stays 0 in dry-run mode)
    """
    exclude_guids = exclude_guids or set()
    processed_count = updated_count = 0

    # Subquery to check if the user has any affiliated institutions
    user_has_affiliations = Exists(
        InstitutionAffiliation.objects.filter(
            user=OuterRef('user')
        )
    )

    # NOTE(review): only 'write_preprint' is matched here; presumably admins
    # also carry that permission via their group, so this covers admins too —
    # confirm against the permission-group setup.
    contributors_qs = PreprintContributor.objects.filter(
        preprint__preprintgroupobjectpermission__permission__codename__in=['write_preprint'],
        preprint__preprintgroupobjectpermission__group__user=F('user'),
    ).filter(
        user_has_affiliations
    ).select_related(
        'user',
        'preprint'
    ).exclude(
        user__guids___id__in=exclude_guids
    ).order_by('pk')  # Ensure consistent ordering for batching

    total_contributors = contributors_qs.count()
    logger.info(f'Total contributors to process: {total_contributors}')

    # Process contributors in batches
    # NOTE(review): one transaction spans ALL batches, so locks taken by
    # select_for_update() are held until the very end — verify this is
    # acceptable for the expected row counts.
    with transaction.atomic():
        for offset in range(0, total_contributors, batch_size):
            # Use select_for_update() to ensure query hits the primary database
            # NOTE(review): select_for_update() is applied AFTER slicing —
            # confirm the Django version in use permits chaining on a sliced
            # queryset and emits the intended FOR UPDATE clause.
            batch_contributors = contributors_qs[offset:offset + batch_size].select_for_update()

            logger.info(f'Processing contributors {offset + 1} to {min(offset + batch_size, total_contributors)}')

            for contributor in batch_contributors:
                user = contributor.user
                preprint = contributor.preprint

                # Skip preprints created after the cutoff; these are assumed
                # to have received affiliations through the normal flow.
                if preprint.created > AFFILIATION_TARGET_DATE:
                    continue

                user_institutions = user.get_affiliated_institutions()
                processed_count += 1
                if not dry_run:
                    # .add() is idempotent for M2M relations, so re-running
                    # the command does not duplicate affiliations.
                    preprint.affiliated_institutions.add(*user_institutions)
                    updated_count += 1
                    logger.info(
                        f'Assigned {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.'
                    )
                else:
                    logger.info(
                        f'Dry run: Would assign {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.'
                    )

    return processed_count, updated_count
1 change: 1 addition & 0 deletions osf/metrics/counted_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def _fill_document_id(counted_usage):
counted_usage.session_id,
counted_usage.timestamp.date(),
time_window,
','.join(sorted(counted_usage.action_labels)),
)


Expand Down
2 changes: 2 additions & 0 deletions osf/metrics/reporters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .public_item_usage import PublicItemUsageReporter
from .user_count import UserCountReporter
from .spam_count import SpamCountReporter
from .private_spam_metrics import PrivateSpamMetricsReporter


class AllDailyReporters(enum.Enum):
Expand All @@ -32,3 +33,4 @@ class AllMonthlyReporters(enum.Enum):
INSTITUTIONAL_USERS = InstitutionalUsersReporter
INSTITUTIONAL_SUMMARY = InstitutionalSummaryMonthlyReporter
ITEM_USAGE = PublicItemUsageReporter
PRIVATE_SPAM_METRICS = PrivateSpamMetricsReporter
Loading

0 comments on commit 2706450

Please sign in to comment.