Add class TuringClassificationSelenium to automate download of classifications and scores from turing
dmf-unicatt · Aug 22, 2024 · 2257e87 · 2257e87
1 parent 7753371
commit 2257e87
Show file tree

Hide file tree

Showing 4 changed files with 461 additions and 1 deletion.
diff --git a/mathrace_interaction/mathrace_interaction/network/__init__.py b/mathrace_interaction/mathrace_interaction/network/__init__.py
@@ -7,3 +7,4 @@
 
 from mathrace_interaction.network.get_ssh_client import get_ssh_client
 from mathrace_interaction.network.open_file_on_ssh_host import open_file_on_ssh_host
+from mathrace_interaction.network.turing_classification_selenium import TuringClassificationSelenium
diff --git a/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py b/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py
@@ -0,0 +1,153 @@
+# Copyright (C) 2024 by the Turing @ DMF authors
+#
+# This file is part of Turing @ DMF.
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""A selenium browser that connects to a classification page on the current live turing instance."""
+
+import urllib.parse
+
+import bs4
+import requests
+import selenium.webdriver
+import selenium.webdriver.common.by
+import selenium.webdriver.support.expected_conditions as EC  # noqa: N812
+import selenium.webdriver.support.ui
+
+
+class TuringClassificationSelenium:
+    """
+    A selenium browser that connects to a classification page on the current live turing instance.
+
+    The instance owns a headless Chrome process. Call quit() when done, or use the instance as a
+    context manager, to make sure that the process is terminated.
+
+    Parameters
+    ----------
+    root_url
+        URL of the root of the turing website.
+    race_id
+        The ID of the turing race to follow.
+    max_wait
+        Maximum amount to wait in seconds for the requested page to load fully.
+
+    Attributes
+    ----------
+    _browser
+        The selenium browser that will be used to connect to the website.
+    _root_url
+        URL of the root of the turing website.
+    _race_id
+        The ID of the turing race to follow.
+    _max_wait
+        Maximum amount to wait in seconds for the requested page to load fully.
+    _can_compute_team_scores
+        Whether the page currently displayed is one from which get_team_scores can read scores.
+    """
+
+    def __init__(self, root_url: str, race_id: int, max_wait: int) -> None:
+        options = selenium.webdriver.ChromeOptions()
+        options.add_argument("--no-sandbox")  # type: ignore[no-untyped-call]
+        options.add_argument("--window-size=1920,1080")  # type: ignore[no-untyped-call]
+        options.add_argument("--headless")  # type: ignore[no-untyped-call]
+        options.add_argument("--disable-gpu")  # type: ignore[no-untyped-call]
+        self._browser = selenium.webdriver.Chrome(options=options)
+        self._root_url = root_url
+        self._race_id = race_id
+        self._max_wait = max_wait
+        # No classification page has been visited yet
+        self._can_compute_team_scores = False
+
+    def __enter__(self) -> "TuringClassificationSelenium":
+        """Return the instance itself, so that it can be used in a with statement."""
+        return self
+
+    def __exit__(self, *exc_info: object) -> None:
+        """Terminate the browser when leaving the with statement, even if an exception was raised."""
+        self.quit()
+
+    def quit(self) -> None:
+        """Terminate the underlying browser process."""
+        self._browser.quit()
+
+    def _wait_for_element(self, by: str, value: str) -> None:
+        """Block until an element matching the locator is present, or raise after max_wait seconds."""
+        selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until(
+            EC.presence_of_element_located((by, value)))  # type: ignore[no-untyped-call]
+
+    def _wait_for_classification_computed(self) -> None:
+        """Block until the javascript variable document.updated is true, or raise after max_wait seconds."""
+        selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until(
+            JavascriptVariableEvaluatesToTrue("document.updated"))
+
+    def login(self, username: str, password: str) -> None:
+        """
+        Log into the turing instance with the provided credentials.
+
+        Parameters
+        ----------
+        username
+            The username to type in the login form.
+        password
+            The password to type in the login form.
+        """
+        self._browser.get(urllib.parse.urljoin(self._root_url, "accounts/login"))
+        # Wait for the login button to appear, and send credentials
+        self._wait_for_element(selenium.webdriver.common.by.By.ID, "submit")
+        self._browser.find_element(selenium.webdriver.common.by.By.NAME, "username").send_keys(username)
+        self._browser.find_element(selenium.webdriver.common.by.By.NAME, "password").send_keys(password)
+        self._browser.find_element(selenium.webdriver.common.by.By.ID, "submit").click()
+        # Successful login redirects to the home page, where there is a button "Crea una nuova gara"
+        self._wait_for_element(selenium.webdriver.common.by.By.CSS_SELECTOR, "a[href='/engine/gara/new']")
+
+    def go_to_classification_page(self, classification_type: str, querystring: dict[str, str]) -> None:
+        """
+        Direct the browser to visit a specific classification type.
+
+        Parameters
+        ----------
+        classification_type
+            Last path component of the classification URL. Only "squadre" enables get_team_scores.
+        querystring
+            Query parameters to be appended to the classification URL. May be empty.
+        """
+        # Invalidate the flag first, so that a failed navigation does not leave a stale value behind
+        self._can_compute_team_scores = False
+        url = urllib.parse.urljoin(self._root_url, f"engine/classifica/{self._race_id}/{classification_type}")
+        if querystring:
+            url += "?" + urllib.parse.urlencode(querystring)
+        self._browser.get(url)
+        # Wait for the classification to be fully computed
+        self._wait_for_classification_computed()
+        # Only certain classification views can compute team scores
+        self._can_compute_team_scores = (classification_type == "squadre")
+
+    def get_team_scores(self) -> list[int]:
+        """
+        Get the scores of the teams in the race.
+
+        Team IDs are scanned as consecutive integers starting from 1, and the scan stops
+        at the first ID for which no score element is found in the page.
+
+        Returns
+        -------
+        :
+            The score of each team, ordered by team ID.
+        """
+        assert self._can_compute_team_scores, "The current page cannot be used to compute team scores"
+        team_id = 1
+        scores = []
+        while True:
+            # find_elements returns an empty list, rather than raising, when the ID is missing
+            score_elements = self._browser.find_elements(
+                selenium.webdriver.common.by.By.ID, f"label-points-{team_id}")
+            if not score_elements:
+                break
+            scores.append(int(score_elements[0].text))
+            team_id += 1
+        return scores
+
+    def get_css_sources(self) -> dict[str, str]:
+        """
+        Get the content of CSS files used in the current page.
+
+        Returns
+        -------
+        :
+            A dictionary from the file name of each stylesheet (without its directory) to its content.
+        """
+        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")
+        all_css: dict[str, str] = {}
+
+        for css in soup.find_all("link", rel="stylesheet"):
+            # Do not use the current selenium browser to fetch the css content, otherwise
+            # the browser would move away from the current page. However, since css content
+            # is static, simply downloading the page via the python package requests suffices.
+            # A timeout is required, because requests would otherwise wait forever on a stalled server.
+            response = requests.get(urllib.parse.urljoin(self._root_url, css["href"]), timeout=self._max_wait)
+            assert response.status_code == 200
+            filename = css["href"].split("/")[-1]
+            assert filename not in all_css, "Cannot have two css files with the same name"
+            all_css[filename] = response.text
+
+        return all_css
+
+    def get_html_source(self) -> str:
+        """
+        Get the HTML source code of a page of the turing instance for local download.
+
+        The HTML code is preprocessed as follows:
+        - the path of any css should be flattened to the one returned by get_css_sources.
+        - any local link to the live instance is removed, since it would not be available locally.
+        - any javascript is removed, since in order to be visible locally the page cannot contain
+          any script that requires the live server.
+
+        Returns
+        -------
+        :
+            The preprocessed HTML source code.
+        """
+        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")
+
+        # Flatten css path
+        for css in soup.find_all("link", rel="stylesheet"):
+            css["href"] = css["href"].split("/")[-1]
+
+        # Remove local links
+        for a in soup.select("a[href]"):
+            assert isinstance(a["href"], str)
+            if a["href"].startswith("/"):
+                del a["href"]
+
+        # Remove <script> tags
+        for script in soup.select("script"):
+            script.decompose()
+
+        # Return postprocessed page
+        return str(soup)
+
+
+class JavascriptVariableEvaluatesToTrue:
+    """
+    Helper class used to wait until a javascript variable is true.
+
+    Instances are meant to be passed to WebDriverWait.until, which calls them repeatedly with
+    the driver until the returned value is truthy.
+
+    Parameters
+    ----------
+    variable
+        The javascript expression to evaluate in the page, e.g. "document.updated".
+    """
+
+    def __init__(self, variable: str) -> None:
+        self._variable = variable
+
+    def __call__(self, driver: selenium.webdriver.Chrome) -> bool:
+        """Condition for waiting until the javascript variable is true."""
+        # The script may return any javascript value (None while the variable is still undefined):
+        # coerce it, so that the declared return type is actually honored.
+        return bool(driver.execute_script(f"return {self._variable};"))  # type: ignore[no-untyped-call]
diff --git a/mathrace_interaction/pyproject.toml b/mathrace_interaction/pyproject.toml
@@ -29,8 +29,11 @@ classifiers = [
     "Topic :: Software Development :: Libraries :: Python Modules"
 ]
 dependencies = [
+    "beautifulsoup4",
     "jsondiff",
-    "paramiko"
+    "paramiko",
+    "requests",
+    "selenium"
 ]
 
 [project.urls]
@@ -47,7 +50,9 @@ lint = [
     "isort",
     "mypy",
     "ruff",
+    "types-beautifulsoup4",
     "types-paramiko",
+    "types-requests",
     "yamllint"
 ]
 tests = [
@@ -56,6 +61,7 @@ tests = [
     "pycryptodomex",
     "pytest",
     "pytest-django",
+    "pytest_httpserver",
     "pytest-random-order"
 ]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -7,3 +7,4 @@

		from mathrace_interaction.network.get_ssh_client import get_ssh_client
		from mathrace_interaction.network.open_file_on_ssh_host import open_file_on_ssh_host
		from mathrace_interaction.network.turing_classification_selenium import TuringClassificationSelenium