Merge remote-tracking branch 'upstream/develop' into feature/preprint…

…s-doi-versioning
CenterForOpenScience · Dec 18, 2024 · 2706450 · 2706450
2 parents 4b8894e + 8643e3f
commit 2706450
Show file tree

Hide file tree

Showing 25 changed files with 620 additions and 32 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -2,6 +2,20 @@
 
 We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO.
 
+24.11.0 (2024-12-11)
+====================
+- Institutional Dashboard Project Bugfix Release
+
+24.10.0 (2024-12-05)
+====================
+
+- Migrate Preprint Affiliations
+- Add OOPSpam and Akismet metrics to spam report
+- Add PrivateSpamMetricsReport
+- Update PrivateSpamMetricsReporter to work with refactored MonthlyReporter
+- Fix duplicate reports when run for past years
+- Fix counted-usage clobbers
+
 24.09.0 (2024-11-14)
 ====================
 

diff --git a/api/institutions/serializers.py b/api/institutions/serializers.py
@@ -330,6 +330,7 @@ class Meta:
     })
 
     id = IDField(source='meta.id', read_only=True)
+    report_yearmonth = YearmonthField(read_only=True)
     user_name = ser.CharField(read_only=True)
     department = ser.CharField(read_only=True, source='department_name')
     orcid_id = ser.CharField(read_only=True)
@@ -372,6 +373,7 @@ class Meta:
 
     id = IDField(read_only=True)
 
+    report_yearmonth = YearmonthField(read_only=True)
     user_count = ser.IntegerField(read_only=True)
     public_project_count = ser.IntegerField(read_only=True)
     private_project_count = ser.IntegerField(read_only=True)

diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py
@@ -188,6 +188,7 @@ def test_get_report(self, app, url, institutional_admin, institution, reports, u
         assert data['type'] == 'institution-summary-metrics'
 
         attributes = data['attributes']
+        assert attributes['report_yearmonth'] == '2024-08'
         assert attributes['user_count'] == 200
         assert attributes['public_project_count'] == 150
         assert attributes['private_project_count'] == 125
@@ -254,6 +255,7 @@ def test_get_report_with_multiple_months_and_institutions(
 
         attributes = data['attributes']
 
+        assert attributes['report_yearmonth'] == '2024-09'
         assert attributes['user_count'] == 250
         assert attributes['public_project_count'] == 200
         assert attributes['private_project_count'] == 150

diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py
@@ -445,6 +445,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu
         response_body = resp.text
         expected_response = [
             [
+                'report_yearmonth',
                 'account_creation_date',
                 'department',
                 'embargoed_registration_count',
@@ -460,6 +461,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu
                 'user_name'
             ],
             [
+                '2024-08',
                 '2018-02',
                 'Center, \t Greatest Ever',
                 '1',
@@ -512,6 +514,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu
                 month_last_login='2018-02',
             )
             expected_data.append([
+                '2024-08',
                 '2018-02',
                 'QBatman',
                 '1',
@@ -552,6 +555,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu
                 response_rows = list(reader)
                 # Validate header row
                 expected_header = [
+                    'report_yearmonth',
                     'account_creation_date',
                     'department',
                     'embargoed_registration_count',
@@ -606,6 +610,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti
         response_data = json.loads(resp.body)
         expected_data = [
             {
+                'report_yearmonth': '2024-08',
                 'account_creation_date': '2018-02',
                 'department': 'Safety "The Wolverine" Weapon X',
                 'embargoed_registration_count': 1,

diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py
@@ -99,8 +99,8 @@ def test_by_client_session_id(self, app, mock_save, user):
         assert resp.status_code == 201
         assert_saved_with(
             mock_save,
-            # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest()
-            expected_doc_id='55fffffdc0d674d15a5e8763d14e4ae90f658fbfb6fbf94f88a5d24978f02e72',
+            # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest()
+            expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2',
             expected_attrs={
                 'platform_iri': 'http://example.foo/',
                 'item_guid': 'zyxwv',
@@ -132,8 +132,8 @@ def test_by_client_session_id_anon(self, app, mock_save):
         assert resp.status_code == 201
         assert_saved_with(
             mock_save,
-            # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest()
-            expected_doc_id='e559ffbc4bd3e3e69252d34c273f0e771ec89ee455ec9b60fbbadf3944e4af4e',
+            # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest()
+            expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82',
             expected_attrs={
                 'platform_iri': 'http://example.foo/',
                 'item_guid': 'zyxwv',
@@ -166,8 +166,8 @@ def test_by_user_auth(self, app, mock_save, user):
         assert resp.status_code == 201
         assert_saved_with(
             mock_save,
-            # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3').hexdigest()
-            expected_doc_id='743494d8a55079b91e202da1dbdfce5aea72e310c57a34b36df2c2af5ed4d362',
+            # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest()
+            expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2',
             expected_attrs={
                 'platform_iri': 'http://example.foo/',
                 'item_guid': 'yxwvu',
@@ -196,8 +196,8 @@ def test_by_useragent_header(self, app, mock_save):
         assert resp.status_code == 201
         assert_saved_with(
             mock_save,
-            # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3').hexdigest()
-            expected_doc_id='a50ac1b2dc1c918cdea7be50b005117fdb6ee00ea069ca3aa4aaf03c0f905fa0',
+            # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest()
+            expected_doc_id='d669528b30f443ffe506e183537af9624ef290090e90a200ecce7b7ca19c77f7',
             expected_attrs={
                 'platform_iri': 'http://example.foo/',
                 'item_guid': 'yxwvu',

diff --git a/osf/external/askismet/client.py b/osf/external/askismet/client.py
@@ -133,3 +133,37 @@ def submit_ham(self, user_ip, user_agent, **kwargs):
         )
         if res.status_code != requests.codes.ok:
             raise AkismetClientError(reason=res.text)
+
+    def get_flagged_count(self, start_date, end_date, category='node'):
+        """Count FLAG_SPAM log entries in a date window where Akismet is recorded as a flagger.
+
+        :param start_date: exclusive lower bound applied to the log's ``created`` field
+        :param end_date: exclusive upper bound applied to the log's ``created`` field
+        :param category: ``'node'`` to count ``NodeLog`` rows, ``'preprint'`` to count ``PreprintLog`` rows
+        :return: number of matching log rows
+        :raises ValueError: if ``category`` is neither ``'node'`` nor ``'preprint'``
+        """
+        # Function-scope import; presumably avoids a circular import with osf.models -- confirm.
+        from osf.models import NodeLog, PreprintLog
+
+        if category not in ('node', 'preprint'):
+            raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")
+
+        if category == 'node':
+            log_model = NodeLog
+        else:
+            log_model = PreprintLog
+
+        # The related object's spam_data.who_flagged must be 'akismet' or 'both'.
+        lookups = {
+            'action': log_model.FLAG_SPAM,
+            'created__gt': start_date,
+            'created__lt': end_date,
+            f'{category}__spam_data__who_flagged__in': ['akismet', 'both'],
+        }
+        return log_model.objects.filter(**lookups).count()
+
+    def get_hammed_count(self, start_date, end_date, category='node'):
+        """Count CONFIRM_HAM log entries in a date window where Akismet is recorded as a flagger.
+
+        :param start_date: exclusive lower bound applied to the log's ``created`` field
+        :param end_date: exclusive upper bound applied to the log's ``created`` field
+        :param category: ``'node'`` to count ``NodeLog`` rows, ``'preprint'`` to count ``PreprintLog`` rows
+        :return: number of matching log rows
+        :raises ValueError: if ``category`` is neither ``'node'`` nor ``'preprint'``
+        """
+        # Function-scope import; presumably avoids a circular import with osf.models -- confirm.
+        from osf.models import NodeLog, PreprintLog
+
+        if category not in ('node', 'preprint'):
+            raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")
+
+        if category == 'node':
+            log_model = NodeLog
+        else:
+            log_model = PreprintLog
+
+        # The related object's spam_data.who_flagged must be 'akismet' or 'both'.
+        lookups = {
+            'action': log_model.CONFIRM_HAM,
+            'created__gt': start_date,
+            'created__lt': end_date,
+            f'{category}__spam_data__who_flagged__in': ['akismet', 'both'],
+        }
+        return log_model.objects.filter(**lookups).count()
diff --git a/osf/external/oopspam/client.py b/osf/external/oopspam/client.py
@@ -45,3 +45,37 @@ def check_content(self, user_ip, content, **kwargs):
 
         #  OOPSpam returns a spam score out of 6. 3 or higher indicates spam
         return spam_score >= settings.OOPSPAM_SPAM_LEVEL, resp_json
+
+    def get_flagged_count(self, start_date, end_date, category='node'):
+        """Count FLAG_SPAM log entries in a date window where OOPSpam is recorded as a flagger.
+
+        :param start_date: exclusive lower bound applied to the log's ``created`` field
+        :param end_date: exclusive upper bound applied to the log's ``created`` field
+        :param category: ``'node'`` to count ``NodeLog`` rows, ``'preprint'`` to count ``PreprintLog`` rows
+        :return: number of matching log rows
+        :raises ValueError: if ``category`` is neither ``'node'`` nor ``'preprint'``
+        """
+        # Function-scope import; presumably avoids a circular import with osf.models -- confirm.
+        from osf.models import NodeLog, PreprintLog
+
+        if category not in ('node', 'preprint'):
+            raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")
+
+        if category == 'node':
+            log_model = NodeLog
+        else:
+            log_model = PreprintLog
+
+        # The related object's spam_data.who_flagged must be 'oopspam' or 'both'.
+        lookups = {
+            'action': log_model.FLAG_SPAM,
+            'created__gt': start_date,
+            'created__lt': end_date,
+            f'{category}__spam_data__who_flagged__in': ['oopspam', 'both'],
+        }
+        return log_model.objects.filter(**lookups).count()
+
+    def get_hammed_count(self, start_date, end_date, category='node'):
+        """Count CONFIRM_HAM log entries in a date window where OOPSpam is recorded as a flagger.
+
+        :param start_date: exclusive lower bound applied to the log's ``created`` field
+        :param end_date: exclusive upper bound applied to the log's ``created`` field
+        :param category: ``'node'`` to count ``NodeLog`` rows, ``'preprint'`` to count ``PreprintLog`` rows
+        :return: number of matching log rows
+        :raises ValueError: if ``category`` is neither ``'node'`` nor ``'preprint'``
+        """
+        # Function-scope import; presumably avoids a circular import with osf.models -- confirm.
+        from osf.models import NodeLog, PreprintLog
+
+        if category not in ('node', 'preprint'):
+            raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.")
+
+        if category == 'node':
+            log_model = NodeLog
+        else:
+            log_model = PreprintLog
+
+        # The related object's spam_data.who_flagged must be 'oopspam' or 'both'.
+        lookups = {
+            'action': log_model.CONFIRM_HAM,
+            'created__gt': start_date,
+            'created__lt': end_date,
+            f'{category}__spam_data__who_flagged__in': ['oopspam', 'both'],
+        }
+        return log_model.objects.filter(**lookups).count()
diff --git a/osf/management/commands/migrate_preprint_affiliation.py b/osf/management/commands/migrate_preprint_affiliation.py
@@ -0,0 +1,118 @@
+import datetime
+import logging
+
+from django.core.management.base import BaseCommand
+from django.db import transaction
+from django.db.models import F, Exists, OuterRef
+
+from osf.models import PreprintContributor, InstitutionAffiliation
+
+logger = logging.getLogger(__name__)
+
+# Cutoff instant (UTC): assign_affiliations_to_preprints skips any preprint whose `created` is later than this.
+AFFILIATION_TARGET_DATE = datetime.datetime(2024, 9, 19, 14, 37, 48, tzinfo=datetime.timezone.utc)
+
+
+class Command(BaseCommand):
+    """Management command that copies users' institutional affiliations onto their preprints.
+
+    Parses ``--exclude-guids``, ``--dry-run`` and ``--batch-size``, delegates the work to
+    ``assign_affiliations_to_preprints`` and logs start/finish times and the resulting counts.
+    """
+
+    help = 'Assign affiliations from users to preprints where they have write or admin permissions.'
+
+    def add_arguments(self, parser):
+        """Register the command-line options on the given argument parser."""
+        parser.add_argument(
+            '--exclude-guids',
+            dest='exclude_guids',
+            nargs='+',
+            help='List of user GUIDs to exclude from affiliation assignment',
+        )
+        parser.add_argument(
+            '--dry-run',
+            dest='dry_run',
+            action='store_true',
+            help='If true, performs a dry run without making changes',
+        )
+        parser.add_argument(
+            '--batch-size',
+            dest='batch_size',
+            type=int,
+            default=1000,
+            help='Number of contributors to process in each batch',
+        )
+
+    def handle(self, *args, **options):
+        """Run the affiliation assignment and log timing plus processed/updated totals."""
+        started_at = datetime.datetime.now()
+        logger.info(f'Script started at: {started_at}')
+
+        is_dry_run = options.get('dry_run', False)
+        if is_dry_run:
+            logger.info('Dry run mode activated.')
+
+        # A missing --exclude-guids option is treated as an empty exclusion set.
+        processed_count, updated_count = assign_affiliations_to_preprints(
+            exclude_guids=set(options.get('exclude_guids') or []),
+            dry_run=is_dry_run,
+            batch_size=options.get('batch_size', 1000),
+        )
+
+        finished_at = datetime.datetime.now()
+        logger.info(f'Script finished at: {finished_at}')
+        logger.info(f'Total processed: {processed_count}, Updated: {updated_count}')
+        logger.info(f'Total run time: {finished_at - started_at}')
+
+
+def assign_affiliations_to_preprints(exclude_guids=None, dry_run=True, batch_size=1000):
+    """Add each qualifying contributor's affiliated institutions to their preprint.
+
+    Selects ``PreprintContributor`` rows whose user holds the ``write_preprint`` permission on the
+    preprint and has at least one ``InstitutionAffiliation``, excluding users whose GUID is in
+    ``exclude_guids``. Preprints created after ``AFFILIATION_TARGET_DATE`` are skipped. All batches
+    run inside a single ``transaction.atomic()`` block.
+
+    :param exclude_guids: collection of user GUIDs to leave out; ``None`` or empty means no exclusions
+    :param dry_run: when true, only log what would be assigned and change nothing
+    :param batch_size: number of contributor rows fetched per slice of the queryset
+    :return: tuple ``(processed_count, updated_count)``
+    """
+    if not exclude_guids:
+        exclude_guids = set()
+    processed_count = 0
+    updated_count = 0
+
+    # EXISTS subquery: true when the contributor's user has at least one InstitutionAffiliation row.
+    has_affiliation = Exists(InstitutionAffiliation.objects.filter(user=OuterRef('user')))
+
+    # Both conditions sit in one filter() call so they apply to the same permission row.
+    write_permission_lookups = {
+        'preprint__preprintgroupobjectpermission__permission__codename__in': ['write_preprint'],
+        'preprint__preprintgroupobjectpermission__group__user': F('user'),
+    }
+    contributors_qs = (
+        PreprintContributor.objects
+        .filter(**write_permission_lookups)
+        .filter(has_affiliation)
+        .select_related('user', 'preprint')
+        .exclude(user__guids___id__in=exclude_guids)
+        .order_by('pk')  # Ensure consistent ordering for batching
+    )
+
+    total_contributors = contributors_qs.count()
+    logger.info(f'Total contributors to process: {total_contributors}')
+
+    # Walk the queryset in fixed-size slices, all within one transaction.
+    with transaction.atomic():
+        for batch_start in range(0, total_contributors, batch_size):
+            batch_end = batch_start + batch_size
+            logger.info(f'Processing contributors {batch_start + 1} to {min(batch_end, total_contributors)}')
+
+            # Use select_for_update() to ensure query hits the primary database
+            for contributor in contributors_qs[batch_start:batch_end].select_for_update():
+                preprint = contributor.preprint
+                if preprint.created > AFFILIATION_TARGET_DATE:
+                    continue
+
+                user = contributor.user
+                user_institutions = user.get_affiliated_institutions()
+                processed_count += 1
+
+                if dry_run:
+                    logger.info(
+                        f'Dry run: Would assign {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.'
+                    )
+                    continue
+
+                preprint.affiliated_institutions.add(*user_institutions)
+                updated_count += 1
+                logger.info(
+                    f'Assigned {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.'
+                )
+
+    return processed_count, updated_count
diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py
@@ -142,6 +142,7 @@ def _fill_document_id(counted_usage):
         counted_usage.session_id,
         counted_usage.timestamp.date(),
         time_window,
+        ','.join(sorted(counted_usage.action_labels)),
     )
 
 

diff --git a/osf/metrics/reporters/__init__.py b/osf/metrics/reporters/__init__.py
@@ -13,6 +13,7 @@
 from .public_item_usage import PublicItemUsageReporter
 from .user_count import UserCountReporter
 from .spam_count import SpamCountReporter
+from .private_spam_metrics import PrivateSpamMetricsReporter
 
 
 class AllDailyReporters(enum.Enum):
@@ -32,3 +33,4 @@ class AllMonthlyReporters(enum.Enum):
     INSTITUTIONAL_USERS = InstitutionalUsersReporter
     INSTITUTIONAL_SUMMARY = InstitutionalSummaryMonthlyReporter
     ITEM_USAGE = PublicItemUsageReporter
+    PRIVATE_SPAM_METRICS = PrivateSpamMetricsReporter