From 2257e871179f73061766cf4f063d024e4c0c99b0 Mon Sep 17 00:00:00 2001
From: Francesco Ballarin <francesco.ballarin@unicatt.it>
Date: Thu, 22 Aug 2024 18:27:08 +0200
Subject: [PATCH] Add class TuringClassificationSelenium to automate download
 of classifications and scores from turing

---
 .../mathrace_interaction/network/__init__.py  |   1 +
 .../network/turing_classification_selenium.py | 153 +++++++++
 mathrace_interaction/pyproject.toml           |   8 +-
 .../test_turing_classification_selenium.py    | 300 ++++++++++++++++++
 4 files changed, 461 insertions(+), 1 deletion(-)
 create mode 100644 mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py
 create mode 100644 mathrace_interaction/tests/unit/network/test_turing_classification_selenium.py

diff --git a/mathrace_interaction/mathrace_interaction/network/__init__.py b/mathrace_interaction/mathrace_interaction/network/__init__.py
index b1845cb..218384a 100644
--- a/mathrace_interaction/mathrace_interaction/network/__init__.py
+++ b/mathrace_interaction/mathrace_interaction/network/__init__.py
@@ -7,3 +7,4 @@
 
 from mathrace_interaction.network.get_ssh_client import get_ssh_client
 from mathrace_interaction.network.open_file_on_ssh_host import open_file_on_ssh_host
+from mathrace_interaction.network.turing_classification_selenium import TuringClassificationSelenium
diff --git a/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py b/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py
new file mode 100644
index 0000000..a14e717
--- /dev/null
+++ b/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py
@@ -0,0 +1,153 @@
+# Copyright (C) 2024 by the Turing @ DMF authors
+#
+# This file is part of Turing @ DMF.
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""A selenium browser that connects to a classification page on the current live turing instance."""
+
+import urllib.parse
+
+import bs4
+import requests
+import selenium.webdriver
+import selenium.webdriver.common.by
+import selenium.webdriver.support.expected_conditions as EC  # noqa: N812
+import selenium.webdriver.support.ui
+
+
+class TuringClassificationSelenium:
+    """
+    A selenium browser that connects to a classification page on the current live turing instance.
+
+    Parameters
+    ----------
+    root_url
+        URL of the root of the turing website.
+    race_id
+        The ID of the turing race to follow.
+    max_wait
+        Maximum amount to wait in seconds for the requested page to load fully.
+
+    Attributes
+    ----------
+    _browser
+        The selenium browser that will be used to connect to the website.
+    _root_url
+        URL of the root of the turing website.
+    _race_id
+        The ID of the turing race to follow.
+    _max_wait
+        Maximum amount to wait in seconds for the requested page to load fully.
+    """
+
+    def __init__(self, root_url: str, race_id: int, max_wait: int) -> None:
+        # Configure a headless Chrome browser with a fixed window size
+        chrome_options = selenium.webdriver.ChromeOptions()
+        for argument in ("--no-sandbox", "--window-size=1920,1080", "--headless", "--disable-gpu"):
+            chrome_options.add_argument(argument)  # type: ignore[no-untyped-call]
+        self._browser = selenium.webdriver.Chrome(options=chrome_options)
+        self._root_url = root_url
+        self._race_id = race_id
+        self._max_wait = max_wait
+        # Team scores are unavailable until go_to_classification_page visits a page that displays them
+        self._can_compute_team_scores = False
+
+    def _wait_for_element(self, by: str, value: str) -> None:
+        element_is_present = EC.presence_of_element_located((by, value))  # type: ignore[no-untyped-call]
+        selenium.webdriver.support.ui.WebDriverWait(self._browser, self._max_wait).until(element_is_present)
+
+    def _wait_for_classification_computed(self) -> None:
+        classification_is_updated = JavascriptVariableEvaluatesToTrue("document.updated")
+        selenium.webdriver.support.ui.WebDriverWait(self._browser, self._max_wait).until(classification_is_updated)
+
+    def login(self, username: str, password: str) -> None:
+        """Log into the turing instance: once the login form has loaded, fill in the credentials and submit it."""
+        by = selenium.webdriver.common.by.By
+        self._browser.get(urllib.parse.urljoin(self._root_url, "accounts/login"))
+        self._wait_for_element(by.ID, "submit")
+        for field_name, field_value in (("username", username), ("password", password)):
+            self._browser.find_element(by.NAME, field_name).send_keys(field_value)
+        self._browser.find_element(by.ID, "submit").click()
+        # Successful login redirects to the home page, where there is a button "Crea una nuova gara"
+        self._wait_for_element(by.CSS_SELECTOR, "a[href='/engine/gara/new']")
+
+    def go_to_classification_page(self, classification_type: str, querystring: dict[str, str]) -> None:
+        """Visit a classification page of the race, appending querystring (if not empty) to its URL."""
+        url = urllib.parse.urljoin(self._root_url, f"engine/classifica/{self._race_id}/{classification_type}")
+        self._browser.get(f"{url}?{urllib.parse.urlencode(querystring)}" if querystring else url)
+        # Wait for the classification to be fully computed
+        self._wait_for_classification_computed()
+        # Only certain classification views can compute team scores
+        self._can_compute_team_scores = (classification_type == "squadre")
+
+    def get_team_scores(self) -> list[int]:
+        """Read the score of every team from the current page, ordered by increasing team ID."""
+        assert self._can_compute_team_scores
+        by_id = selenium.webdriver.common.by.By.ID
+        team_scores: list[int] = []
+        # Scan team IDs 1, 2, 3, ... and stop at the first ID for which
+        # the page does not contain a score label
+        next_team_id = 1
+        while True:
+            labels = self._browser.find_elements(by_id, f"label-points-{next_team_id}")
+            if not labels:
+                return team_scores
+            team_scores.append(int(labels[0].text))
+            next_team_id += 1
+
+    def get_css_sources(self) -> dict[str, str]:
+        """Get the content of CSS files used in the current page, indexed by file name (without its path)."""
+        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")
+        all_css: dict[str, str] = {}
+
+        for css in soup.find_all("link", rel="stylesheet"):
+            # Fetch the css with requests rather than with the selenium browser, which would otherwise
+            # move away from the current page. This suffices because css content is static. The timeout
+            # prevents requests from waiting forever on an unresponsive server.
+            response = requests.get(urllib.parse.urljoin(self._root_url, css["href"]), timeout=self._max_wait)
+            assert response.status_code == 200
+            filename = css["href"].split("/")[-1]
+            assert filename not in all_css, "Cannot have two css files with the same name"
+            all_css[filename] = response.text
+
+        return all_css
+
+    def get_html_source(self) -> str:
+        """
+        Get the HTML source code of the current page, postprocessed so that it can be viewed locally.
+
+        The following changes are applied to the source reported by the browser:
+        - the href of every stylesheet is flattened to its file name, i.e. the key used by get_css_sources.
+        - the href of every link with a path local to the live instance is dropped, since its target
+          would not be available locally.
+        - every <script> tag is removed, since the page cannot rely on scripts that require the live server.
+        """
+        page = bs4.BeautifulSoup(self._browser.page_source, "html.parser")
+
+        # Flatten css path
+        for stylesheet in page.find_all("link", rel="stylesheet"):
+            stylesheet["href"] = stylesheet["href"].split("/")[-1]
+
+        # Remove local links
+        for anchor in page.select("a[href]"):
+            target = anchor["href"]
+            assert isinstance(target, str)
+            if target.startswith("/"):
+                del anchor["href"]
+
+        # Remove <script> tags
+        for script_tag in page.select("script"):
+            script_tag.decompose()
+
+        return str(page)
+
+
+class JavascriptVariableEvaluatesToTrue:
+    """Helper class used as a wait condition which holds once a javascript variable evaluates to true."""
+
+    def __init__(self, variable: str) -> None:
+        self._variable = variable
+
+    def __call__(self, driver: selenium.webdriver.Chrome) -> bool:
+        """Return whether the javascript variable is currently truthy in the page loaded by driver."""
+        return bool(driver.execute_script(f"return {self._variable};"))  # type: ignore[no-untyped-call]
diff --git a/mathrace_interaction/pyproject.toml b/mathrace_interaction/pyproject.toml
index 30ccd7f..ce07e5a 100644
--- a/mathrace_interaction/pyproject.toml
+++ b/mathrace_interaction/pyproject.toml
@@ -29,8 +29,11 @@ classifiers = [
     "Topic :: Software Development :: Libraries :: Python Modules"
 ]
 dependencies = [
+    "bs4",
     "jsondiff",
-    "paramiko"
+    "paramiko",
+    "requests",
+    "selenium"
 ]
 
 [project.urls]
@@ -47,7 +50,9 @@ lint = [
     "isort",
     "mypy",
     "ruff",
+    "types-beautifulsoup4",
     "types-paramiko",
+    "types-requests",
     "yamllint"
 ]
 tests = [
@@ -56,6 +61,7 @@ tests = [
     "pycryptodomex",
     "pytest",
     "pytest-django",
+    "pytest_httpserver",
     "pytest-random-order"
 ]
 
diff --git a/mathrace_interaction/tests/unit/network/test_turing_classification_selenium.py b/mathrace_interaction/tests/unit/network/test_turing_classification_selenium.py
new file mode 100644
index 0000000..f810a44
--- /dev/null
+++ b/mathrace_interaction/tests/unit/network/test_turing_classification_selenium.py
@@ -0,0 +1,300 @@
+# Copyright (C) 2024 by the Turing @ DMF authors
+#
+# This file is part of Turing @ DMF.
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Test mathrace_interaction.network.TuringClassificationSelenium."""
+
+import pytest_httpserver
+import selenium.webdriver.common.by
+import selenium.webdriver.support.color
+
+import mathrace_interaction.network
+
+
+class Browser(mathrace_interaction.network.TuringClassificationSelenium):
+    """A TuringClassificationSelenium rooted at the mock httpserver, following race 0 with a 1 second max wait."""
+
+    def __init__(self, httpserver: pytest_httpserver.HTTPServer) -> None:
+        super().__init__(root_url=httpserver.url_for("/"), race_id=0, max_wait=1)
+
+
+def test_browser_response(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that the underlying selenium browser can fetch a simple page from the mock httpserver."""
+    httpserver.expect_request("/").respond_with_data("Hello world!", content_type="text/plain")
+
+    driver = Browser(httpserver)._browser  # not named selenium, which would shadow the imported package
+    driver.get(httpserver.url_for("/"))
+    assert "Hello world!" in driver.page_source
+
+
+def test_classification_browser_login(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that mathrace_interaction.network.TuringClassificationSelenium.login submits the provided credentials."""
+    login_page = f"""<html>
+<body>
+<form action="{httpserver.url_for("/engine")}">
+    <input type="text" name="username">
+    <input type="password" name="password">
+    <input type="submit" id="submit" value="Login">
+</form>
+</body>
+</html>"""
+    post_login_page = """<html>
+<body>
+<span id="qs">Processing querystring</span>
+<script>
+const params = new URLSearchParams(window.location.search);
+document.getElementById("qs").textContent =
+    "username is " + params.get("username") + " and password is " + params.get("password");
+</script>
+<a href="/engine/gara/new">Crea nuova gara</a>
+</body>
+</html>"""
+    httpserver.expect_request("/accounts/login").respond_with_data(login_page, content_type="text/html")
+    httpserver.expect_request("/engine").respond_with_data(post_login_page, content_type="text/html")
+
+    browser = Browser(httpserver)
+    browser.login("admin", "secret")
+    assert "Processing querystring" not in browser._browser.page_source
+    assert "username is admin and password is secret" in browser._browser.page_source
+    assert "Crea nuova gara" in browser._browser.page_source
+
+
+def test_classification_browser_go_to_classification_page(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that TuringClassificationSelenium.go_to_classification_page waits for the computation to finish."""
+    classification_page = """<html>
+<body>
+<span id="qs">Processing classification</span>
+<script>
+document.updated = false;
+setTimeout(function(){
+    document.getElementById("qs").textContent = "Classification has " + "been computed";
+    document.updated = true;
+}, 10);
+</script>
+</html>"""
+    httpserver.expect_request("/engine/classifica/0/unica").respond_with_data(
+        classification_page, content_type="text/html")
+
+    browser = Browser(httpserver)
+    browser.go_to_classification_page("unica", {})
+    assert "Processing classification" not in browser._browser.page_source
+    assert "Classification has been computed" in browser._browser.page_source
+
+
+def test_classification_browser_get_team_scores(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that TuringClassificationSelenium.get_team_scores reads the scores currently shown by the page."""
+    classification_page = """<html>
+<body>
+<script>
+document.updated = false;
+setTimeout(function(){
+    document.updated = true;
+}, 10);
+
+var current_time = 0;
+function incrementScores() {
+    current_time += 1;
+    document.getElementById("label-points-1").textContent = current_time.toString();
+    document.getElementById("label-points-2").textContent = (1 + current_time * 2).toString();
+    document.getElementById("label-points-3").textContent = (2 + current_time * 3).toString();
+}
+</script>
+Team 1: <span id="label-points-1">0</span>
+Team 2: <span id="label-points-2">1</span>
+Team 3: <span id="label-points-3">2</span>
+<button id="increment" onclick="incrementScores()">Increment scores</button>
+</body>
+</html>"""
+    httpserver.expect_request("/engine/classifica/0/squadre").respond_with_data(
+        classification_page, content_type="text/html")
+
+    browser = Browser(httpserver)
+    browser.go_to_classification_page("squadre", {})
+    assert 'Team 1: <span id="label-points-1">0</span>' in browser._browser.page_source
+    assert 'Team 2: <span id="label-points-2">1</span>' in browser._browser.page_source
+    assert 'Team 3: <span id="label-points-3">2</span>' in browser._browser.page_source
+
+    # Click on the button once to check that the page updates the displayed scores
+    browser._browser.find_element(selenium.webdriver.common.by.By.ID, "increment").click()
+    assert 'Team 1: <span id="label-points-1">1</span>' in browser._browser.page_source
+    assert 'Team 2: <span id="label-points-2">3</span>' in browser._browser.page_source
+    assert 'Team 3: <span id="label-points-3">5</span>' in browser._browser.page_source
+
+    # Get the computed scores, and compare them with the expected ones
+    computed_scores = browser.get_team_scores()
+    assert computed_scores == [1, 3, 5]
+
+    # Increment the scores again
+    browser._browser.find_element(selenium.webdriver.common.by.By.ID, "increment").click()
+    assert 'Team 1: <span id="label-points-1">2</span>' in browser._browser.page_source
+    assert 'Team 2: <span id="label-points-2">5</span>' in browser._browser.page_source
+    assert 'Team 3: <span id="label-points-3">8</span>' in browser._browser.page_source
+
+    # Get the computed scores, and compare them with the expected ones
+    computed_scores = browser.get_team_scores()
+    assert computed_scores == [2, 5, 8]
+
+
+def test_classification_browser_get_css_sources(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that TuringClassificationSelenium.get_css_sources downloads every stylesheet linked by the page."""
+    index_page = """<html>
+<head>
+    <link href="/folder1/style1.css" rel="stylesheet" type="text/css">
+    <link href="/folder2/subfolder2/style2.css" rel="stylesheet" type="text/css">
+</head>
+<body>
+<div id="container"><span id="text">Hello world!</span></div>
+</body>
+</html>"""
+    style1_css = """div {
+  background-color: #ff0000;
+}"""
+    style2_css = """span {
+  color: #00ffff;
+}"""
+
+    httpserver.expect_request("/").respond_with_data(index_page, content_type="text/html")
+    httpserver.expect_request("/folder1/style1.css").respond_with_data(style1_css, content_type="text/css")
+    httpserver.expect_request("/folder2/subfolder2/style2.css").respond_with_data(style2_css, content_type="text/css")
+
+    browser = Browser(httpserver)
+    browser._browser.get(httpserver.url_for("/"))
+    assert "Hello world!" in browser._browser.page_source
+
+    # Ensure that the live page applies both stylesheets: background color and text color are as expected
+    container = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "container")
+    bg_color_rgba = container.value_of_css_property("background-color")
+    assert selenium.webdriver.support.color.Color.from_string(bg_color_rgba).hex == "#ff0000"
+    text = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "text")
+    color_rgba = text.value_of_css_property("color")
+    assert selenium.webdriver.support.color.Color.from_string(color_rgba).hex == "#00ffff"
+
+    # Get a dictionary containing the sources of all CSS files, indexed by file name
+    all_css = browser.get_css_sources()
+    assert len(all_css) == 2
+    assert "style1.css" in all_css
+    assert "style2.css" in all_css
+    assert all_css["style1.css"] == style1_css
+    assert all_css["style2.css"] == style2_css
+
+
+def test_classification_browser_get_html_source_replaces_css(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that TuringClassificationSelenium.get_html_source flattens css paths to their file names."""
+    index_page = """<html>
+<head>
+    <link href="/folder1/style1.css" rel="stylesheet" type="text/css">
+    <link href="/folder2/subfolder2/style2.css" rel="stylesheet" type="text/css">
+</head>
+<body>
+<div id="container"><span id="text">Hello world!</span></div>
+</body>
+</html>"""
+    style1_css = """div {
+  background-color: #ff0000;
+}"""
+    style2_css = """span {
+  color: #00ffff;
+}"""
+
+    httpserver.expect_request("/").respond_with_data(index_page, content_type="text/html")
+    httpserver.expect_request("/folder1/style1.css").respond_with_data(style1_css, content_type="text/css")
+    httpserver.expect_request("/folder2/subfolder2/style2.css").respond_with_data(style2_css, content_type="text/css")
+
+    browser = Browser(httpserver)
+    browser._browser.get(httpserver.url_for("/"))
+    assert "Hello world!" in browser._browser.page_source
+    assert '"/folder1/style1.css"' in browser._browser.page_source
+    assert '"/folder2/subfolder2/style2.css"' in browser._browser.page_source
+
+    # Get the postprocessed HTML source, and check that it contains the flattened css names
+    postprocessed_source = browser.get_html_source()
+    assert '"style1.css"' in postprocessed_source
+    assert '"style2.css"' in postprocessed_source
+    assert '"/folder1/style1.css"' not in postprocessed_source
+    assert '"/folder2/subfolder2/style2.css"' not in postprocessed_source
+
+    # Ensure that background color and text color are as expected, i.e. changing the file names
+    # in the postprocessed source did not affect the live page.
+    container = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "container")
+    bg_color_rgba = container.value_of_css_property("background-color")
+    assert selenium.webdriver.support.color.Color.from_string(bg_color_rgba).hex == "#ff0000"
+    text = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "text")
+    color_rgba = text.value_of_css_property("color")
+    assert selenium.webdriver.support.color.Color.from_string(color_rgba).hex == "#00ffff"
+
+
+def test_classification_browser_get_html_source_strips_links(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that TuringClassificationSelenium.get_html_source strips the href of local links from its output."""
+    start_page = """<html>
+<body>
+<a href="/redirect" id="link">Redirect</a>
+</body>
+</html>"""
+    redirect_page = """<html>
+<body>
+Success!
+</body>
+</html>"""
+    httpserver.expect_request("/").respond_with_data(start_page, content_type="text/html")
+    httpserver.expect_request("/redirect").respond_with_data(redirect_page, content_type="text/html")
+
+    browser = Browser(httpserver)
+    browser._browser.get(httpserver.url_for("/"))
+    assert "Redirect" in browser._browser.page_source
+    assert "/redirect" in browser._browser.page_source
+    assert "<a href" in browser._browser.page_source
+
+    # Get the postprocessed HTML source, and check that the link is kept but has lost its href attribute
+    postprocessed_source = browser.get_html_source()
+    assert "Redirect" in postprocessed_source
+    assert "/redirect" not in postprocessed_source
+    assert "<a " in postprocessed_source
+    assert "href" not in postprocessed_source
+
+    # Click on the link to check that it works, i.e. removing the href
+    # from the postprocessed source did not affect the live page.
+    browser._browser.find_element(selenium.webdriver.common.by.By.ID, "link").click()
+    assert "Success!" in browser._browser.page_source
+
+
+def test_classification_browser_get_html_source_strips_javascript(httpserver: pytest_httpserver.HTTPServer) -> None:
+    """Test that TuringClassificationSelenium.get_html_source strips <script> tags from its output."""
+    javascript_button_page = """<html>
+<body>
+<script>
+function incrementTime() {
+    const prefix = "Current time is ";
+    var current_time = parseInt(document.getElementById("time").textContent.replace(prefix, ""));
+    document.getElementById("time").textContent = prefix + (current_time + 1).toString();
+}
+</script>
+<span id="time">Current time is 0</span>
+<button id="increment" onclick="incrementTime()">Increment time</button>
+</body>
+</html>"""
+    httpserver.expect_request("/").respond_with_data(javascript_button_page, content_type="text/html")
+
+    browser = Browser(httpserver)
+    browser._browser.get(httpserver.url_for("/"))
+    assert "Current time is 0" in browser._browser.page_source
+    assert "<script>" in browser._browser.page_source
+
+    # Click on the button once to check that the script updates the displayed time
+    browser._browser.find_element(selenium.webdriver.common.by.By.ID, "increment").click()
+    assert "Current time is 0" not in browser._browser.page_source
+    assert "Current time is 1" in browser._browser.page_source
+    assert "<script>" in browser._browser.page_source
+
+    # Get the postprocessed HTML source, and check that it does not have any <script> tag
+    postprocessed_source = browser.get_html_source()
+    assert "Current time is 1" in postprocessed_source
+    assert "<script>" not in postprocessed_source
+
+    # Click on the button again to check that it still works, i.e. removing the <script>
+    # tag from the postprocessed source did not affect the live page.
+    browser._browser.find_element(selenium.webdriver.common.by.By.ID, "increment").click()
+    assert "Current time is 0" not in browser._browser.page_source
+    assert "Current time is 1" not in browser._browser.page_source
+    assert "Current time is 2" in browser._browser.page_source
+    assert "<script>" in browser._browser.page_source