mims-harvard · amva13 · May 8, 2024 · May 7, 2024 · May 8, 2024 · May 8, 2024
diff --git a/run_tests.py b/run_tests.py
@@ -1,6 +1,7 @@
 import unittest
 import sys
 
+
 if __name__ == '__main__':
     loader = unittest.TestLoader()
     start_dir = 'tdc/test'

diff --git a/tdc/benchmark_group/scdti_group.py b/tdc/benchmark_group/scdti_group.py
@@ -47,3 +47,35 @@ def evaluate(self, y_pred):
         accuracy = accuracy_score(y_true, y_pred)
         f1 = f1_score(y_true, y_pred)
         return [precision, recall, accuracy, f1]
+
+    def evaluate_many(self, preds):
+        """Summarize ``evaluate`` metrics across several prediction sets.
+
+        Each entry of ``preds`` is scored with ``self.evaluate`` and the
+        resulting metrics are aggregated per metric.
+
+        Args:
+            preds: Sized collection of prediction sets, one per run/seed.
+                At least 5 entries are required.
+
+        Returns:
+            dict: Maps "precision", "recall", "accuracy" and "f1" to a
+            ``(mean, std)`` tuple computed over all runs.
+
+        Raises:
+            ValueError: If fewer than 5 prediction sets are given.
+        """
+        from numpy import mean, std
+
+        if len(preds) < 5:
+            raise ValueError(
+                "Run your model on at least 5 seeds to compare results and provide your outputs in preds."
+            )
+        # Order must match the list returned by evaluate().
+        metric_names = ("precision", "recall", "accuracy", "f1")
+        scores = [self.evaluate(p) for p in preds]
+        # zip(*scores) transposes per-run rows into one column per metric.
+        return {
+            name: (mean(column), std(column))
+            for name, column in zip(metric_names, zip(*scores))
+        }
diff --git a/tdc/test/test_benchmark.py b/tdc/test/test_benchmark.py
@@ -80,6 +80,10 @@ def test_SCDTI_benchmark(self):
         zero_pred = [0] * len(y_true)
         results = group.evaluate(zero_pred)
         assert results[-1] != 1.0  # should not be perfect F1 score
+        many_results = group.evaluate_many([y_true] * 5)
+        assert "f1" in many_results
+        assert len(many_results["f1"]
+                  ) == 2  # should include mean and standard deviation
 
 
 if __name__ == "__main__":