Overview query breakout #89

Merged: 7 commits, Sep 5, 2024
Changes from 3 commits
57 changes: 57 additions & 0 deletions api.py
@@ -249,6 +249,63 @@ def search_overview_via_payload(collection: Collection, req: Request, payload: Q
return ES.search_overview(collection.name, payload.q)


@v1.get("/{collection}/search/daily_counts", tags=["data"])
@v1.head("/{collection}/search/daily_counts", include_in_schema=False)
def search_daily_counts(collection: Collection, q: str, req: Request):
"""
Report a daily count histogram of the search results
"""
return ES.daily_counts(collection.name, q)


@v1.post("/{collection}/search/daily_counts", tags=["data"])
def search_daily_counts_via_payload(
collection: Collection, req: Request, payload: Query
):
"""
Report a daily count histogram of the search results
"""
return ES.daily_counts(collection.name, payload.q)


@v1.get("/{collection}/search/top_languages", tags=["data"])
@v1.head("/{collection}/search/top_languages", include_in_schema=False)
def search_top_languages(collection: Collection, q: str, req: Request):
"""
Report the top languages in the search results
"""
return ES.top_languages(collection.name, q)


@v1.post("/{collection}/search/top_languages", tags=["data"])
def search_top_languages_via_payload(
collection: Collection, req: Request, payload: Query
):
"""
Report the top languages in the search results
"""
return ES.top_languages(collection.name, payload.q)


@v1.get("/{collection}/search/top_domains", tags=["data"])
@v1.head("/{collection}/search/top_domains", include_in_schema=False)
def search_top_domains(collection: Collection, q: str, req: Request):
"""
Report the top domains in the search results
"""
return ES.top_domains(collection.name, q)


@v1.post("/{collection}/search/top_domains", tags=["data"])
def search_top_domains_via_payload(
collection: Collection, req: Request, payload: Query
):
"""
Report the top domains in the search results
"""
return ES.top_domains(collection.name, payload.q)
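For reference, a minimal sketch of calling the new breakout endpoints from a client; the host, collection name, and use of httpx are assumptions, not part of this diff.

```python
# Minimal sketch of exercising the new breakout endpoints.
# Assumptions: the API is served on localhost:8000 and a collection
# named "mc_search" exists; adjust both for a real deployment.
import httpx

BASE = "http://localhost:8000/v1/mc_search"

# GET form: the query is passed as the `q` query parameter.
daily = httpx.get(f"{BASE}/search/daily_counts", params={"q": "mediacloud"})
print(daily.json().get("dailycounts"))

# POST form: the query is wrapped in a JSON payload, as in api_test.py.
langs = httpx.post(f"{BASE}/search/top_languages", json={"q": "mediacloud"})
print(langs.json().get("toplangs"))
```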


@v1.get("/{collection}/search/result", tags=["data"])
@v1.head("/{collection}/search/result", include_in_schema=False)
def search_result_via_query_params(
120 changes: 90 additions & 30 deletions client.py
@@ -43,6 +43,21 @@ class QueryBuilder:
VALID_SORT_ORDERS = ["asc", "desc"]
VALID_SORT_FIELDS = ["publication_date", "indexed_date"]

class Aggregators(Enum):
DAILY_COUNTS = {
"dailycounts": {
"date_histogram": {
"field": "publication_date",
"calendar_interval": "day",
"min_doc_count": 1,
}
}
}
TOP_LANGS = {"toplangs": {"terms": {"field": "language.keyword", "size": 100}}}
TOP_DOMAINS = {
"topdomains": {"terms": {"field": "canonical_domain.keyword", "size": 100}}
Contributor:
Field changing from canonical_domain to canonical_domain.keyword?

Member Author:
Yes! In trying to benchmark the latency of these operations I noticed that aggregating on the .keyword field of canonical_domain made the query return significantly faster; ultimately it made the overview query almost twice as fast.
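As context for that benchmark claim, here is a rough sketch of the kind of side-by-side timing it describes; the client setup, index name, and query body are placeholders, not part of this PR.

```python
# Rough sketch of timing a terms aggregation on the raw field versus
# its .keyword sub-field. Host, index name, and query are placeholders.
import time
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

def time_domain_agg(field: str) -> float:
    body = {
        "size": 0,
        "query": {"query_string": {"query": "mediacloud"}},
        "aggregations": {"topdomains": {"terms": {"field": field, "size": 100}}},
    }
    start = time.perf_counter()
    es.search(index="mc_search", body=body)
    return time.perf_counter() - start

print("canonical_domain        :", time_domain_agg("canonical_domain"))
print("canonical_domain.keyword:", time_domain_agg("canonical_domain.keyword"))
```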

}

def __init__(self, query_text):
self.query_text = query_text
self._source = [
@@ -95,28 +110,17 @@ def basic_query(self, expanded: bool = False) -> Dict:
}
return default

def overview_query(self):
def aggregator_query(self, *aggs: "QueryBuilder.Aggregators") -> Dict:
query = self.basic_query()
query.update(
{
"aggregations": {
"daily": {
"date_histogram": {
"field": "publication_date",
"calendar_interval": "day",
"min_doc_count": 1,
}
},
"lang": {"terms": {"field": "language.keyword", "size": 100}},
"domain": {"terms": {"field": "canonical_domain", "size": 100}},
"tld": {"terms": {"field": "tld", "size": 100}},
Contributor:
Are we dropping tld?

Member Author:
None of the other layers above this point in the API use TLD. We had been calculating it here, I think, so that we had a backup for when the total count was above the 10,000-document limit imposed by Elasticsearch, and we can replace that function with the domain field, so I removed it here in the spirit of speeding up the query! But happy to consider leaving it in. @rahulbot, any thoughts?

Contributor:
Yeah, I can't think of a research use of TLD in our current practices. I'd support removal.

Member Author (@pgulley, Jul 26, 2024):
I'm going to put this snippet of a conversation I had with @ibnesayeed many months ago here for context:

Basically, we had an aggregation on TLDs, so it would report how many matching documents there are for each TLD, which we can sum up to get a better total count. While we had aggregations on other aspects, such as language, date, etc., which I could have used for summation, I chose TLD for two reasons: 1) it is finite in number, so there are very few entries to add and it will never overflow the usual search result limit, and 2) almost all documents have a TLD associated, unlike language or publication date, which might be empty, so some documents would go uncounted.

I think this property of TLDs, where it is always filled, is also true of canonical_domain, so we can drop that in instead.

Member Author:
Implemented this backup in the most recent commit.

},
"aggregations": {k: v for agg in aggs for k, v in agg.value.items()},
"track_total_hits": True,
}
)
return query
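To make the dict comprehension above concrete, a small sketch of combining two Aggregators members into one query body (the query text is just an example):

```python
# Sketch: two Aggregators members fold into a single "aggregations" block.
qb = QueryBuilder("mediacloud")  # example query text
body = qb.aggregator_query(
    QueryBuilder.Aggregators.DAILY_COUNTS,
    QueryBuilder.Aggregators.TOP_LANGS,
)
# body["aggregations"] now contains both named aggregations:
#   "dailycounts" -> the publication_date date_histogram
#   "toplangs"    -> the language.keyword terms aggregation
# and body["track_total_hits"] is True.
```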

def terms_query(self, field):
def terms_query(self, field) -> Dict:
resct = 200
aggr_map = {
"terms": {
@@ -170,7 +174,7 @@ def paged_query(
query["search_after"] = [decode_key(resume)]
return query

def article_query(self):
def article_query(self) -> Dict:
default: dict = {
"_source": self._expanded_source,
"query": {"match": {"_id": self.query_text}},
@@ -249,30 +253,86 @@ def format_day_counts(self, bucket: list):
def format_counts(self, bucket: list):
return {item["key"]: item["doc_count"] for item in bucket}

def search_overview(self, collection: str, q: str):
def aggregator_query(
self, collection: str, q: str, *aggs: QueryBuilder.Aggregators, **options
):
"""
Get overview statistics for a query
Abstraction to DRY out the permutations of the 'overview' query that are being broken out into their own calls
"""
res = self.ES.search(index=collection, body=QueryBuilder(q).overview_query()) # type: ignore [call-arg]
query_body = QueryBuilder(q).aggregator_query(*aggs)

res = self.ES.search(index=collection, body=query_body) # type: ignore [call-arg]
if not res["hits"]["hits"]:
raise HTTPException(status_code=404, detail="No results found!")

total = res["hits"]["total"]["value"]
tldsum = sum(
item["doc_count"] for item in res["aggregations"]["tld"]["buckets"]
)
return {
return_dict = {
"query": q,
"total": max(total, tldsum),
"topdomains": self.format_counts(res["aggregations"]["domain"]["buckets"]),
"toptlds": self.format_counts(res["aggregations"]["tld"]["buckets"]),
"toplangs": self.format_counts(res["aggregations"]["lang"]["buckets"]),
"dailycounts": self.format_day_counts(
res["aggregations"]["daily"]["buckets"]
),
"matches": [self.format_match(h, collection) for h in res["hits"]["hits"]],
}

# Add the results of each aggregator to the return value
for agg in aggs:
agg_name = next(iter(agg.value.keys()))
return_dict.update(
{agg_name: self.format_counts(res["aggregations"][agg_name]["buckets"])}
)

# Only return the total and matches if explicitly requested
if "overview" in options:
return_dict.update(
{
"total": total,
"matches": [ # type: ignore [dict-item]
self.format_match(h, collection) for h in res["hits"]["hits"]
],
}
)

return return_dict
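The review thread above mentions replacing the old TLD-based total backup with a sum over canonical_domain buckets, described as landing in a later commit and therefore not visible in this diff. As a rough sketch of what that fallback could look like (names mirror this file, but this is not the merged implementation):

```python
def backup_total(res: dict, reported_total: int) -> int:
    """Sketch of the fallback discussed in the review thread: sum the
    per-domain doc counts and take the larger of that and the reported
    hit total, mirroring the old TLD-based backup but keyed on the
    "topdomains" aggregation. Not the code merged in this PR."""
    buckets = res.get("aggregations", {}).get("topdomains", {}).get("buckets", [])
    domain_sum = sum(item["doc_count"] for item in buckets)
    return max(reported_total, domain_sum)
```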

def search_overview(self, collection: str, q: str):
"""
Get overview statistics for a query
"""
return self.aggregator_query(
collection,
q,
QueryBuilder.Aggregators.DAILY_COUNTS,
QueryBuilder.Aggregators.TOP_LANGS,
QueryBuilder.Aggregators.TOP_DOMAINS,
overview=True,
)

def daily_counts(self, collection: str, q: str):
"""
Return just a daily count histogram for a query
"""
return self.aggregator_query(
collection,
q,
QueryBuilder.Aggregators.DAILY_COUNTS,
)

def top_languages(self, collection: str, q: str):
"""
Return top languages for a query
"""
return self.aggregator_query(
collection,
q,
QueryBuilder.Aggregators.TOP_LANGS,
)

def top_domains(self, collection: str, q: str):
"""
Return top domains for a query
"""
return self.aggregator_query(
collection,
q,
QueryBuilder.Aggregators.TOP_DOMAINS,
)
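A quick sketch of how the new client methods might be exercised directly; `es_client` and the collection name are placeholders, not defined in this diff.

```python
# Placeholders: es_client is an instance of the search client class above,
# and "mc_search" is an example collection name.
overview = es_client.search_overview("mc_search", "mediacloud")
daily = es_client.daily_counts("mc_search", "mediacloud")
langs = es_client.top_languages("mc_search", "mediacloud")

# overview includes "total" and "matches" because it passes overview=True;
# the breakout calls return only the echoed "query" plus their own buckets.
print(sorted(daily.keys()))  # expected: ["dailycounts", "query"]
```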

def search_result(
self,
collection: str,
33 changes: 33 additions & 0 deletions test/api_test.py
@@ -364,3 +364,36 @@ def test_top_terms(self):
results = response.json()
assert response.status_code == 200
assert len(results) > 0

def test_daily_counts(self):
response = self._client.post(
f"/v1/{INDEX_NAME}/search/daily_counts",
json={"q": "mediacloud"},
timeout=TIMEOUT,
)

results = response.json()
assert response.status_code == 200
assert "dailycounts" in results

def test_top_languages(self):
response = self._client.post(
f"/v1/{INDEX_NAME}/search/top_languages",
json={"q": "mediacloud"},
timeout=TIMEOUT,
)

results = response.json()
assert response.status_code == 200
assert "toplangs" in results

def test_top_domains(self):
response = self._client.post(
f"/v1/{INDEX_NAME}/search/top_domains",
json={"q": "mediacloud"},
timeout=TIMEOUT,
)

results = response.json()
assert response.status_code == 200
assert "topdomains" in results