From 2257e871179f73061766cf4f063d024e4c0c99b0 Mon Sep 17 00:00:00 2001 From: Francesco Ballarin Date: Thu, 22 Aug 2024 18:27:08 +0200 Subject: [PATCH] Add class TuringClassificationSelenium to automate download of classifications and scores from turing --- .../mathrace_interaction/network/__init__.py | 1 + .../network/turing_classification_selenium.py | 153 +++++++++ mathrace_interaction/pyproject.toml | 8 +- .../test_turing_classification_selenium.py | 300 ++++++++++++++++++ 4 files changed, 461 insertions(+), 1 deletion(-) create mode 100644 mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py create mode 100644 mathrace_interaction/tests/unit/network/test_turing_classification_selenium.py diff --git a/mathrace_interaction/mathrace_interaction/network/__init__.py b/mathrace_interaction/mathrace_interaction/network/__init__.py index b1845cb..218384a 100644 --- a/mathrace_interaction/mathrace_interaction/network/__init__.py +++ b/mathrace_interaction/mathrace_interaction/network/__init__.py @@ -7,3 +7,4 @@ from mathrace_interaction.network.get_ssh_client import get_ssh_client from mathrace_interaction.network.open_file_on_ssh_host import open_file_on_ssh_host +from mathrace_interaction.network.turing_classification_selenium import TuringClassificationSelenium diff --git a/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py b/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py new file mode 100644 index 0000000..a14e717 --- /dev/null +++ b/mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py @@ -0,0 +1,153 @@ +# Copyright (C) 2024 by the Turing @ DMF authors +# +# This file is part of Turing @ DMF. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +"""A selenium browser that connects to a classification page on the current live turing instance.""" + +import urllib.parse + +import bs4 +import requests +import selenium.webdriver +import selenium.webdriver.common.by +import selenium.webdriver.support.expected_conditions as EC # noqa: N812 +import selenium.webdriver.support.ui + + +class TuringClassificationSelenium: + """ + A selenium browser that connects to a classification page on the current live turing instance. + + Parameters + ---------- + root_url + URL of the root of the turing website. + race_id + The ID of the turing race to follow. + max_wait + Maximum amount to wait in seconds for the requested page to load fully. + + Attributes + ---------- + _browser + The selenium browser that will be used to connect to the website. + _root_url + URL of the root of the turing website. + _race_id + The ID of the turing race to follow. + _max_wait + Maximum amount to wait in seconds for the requested page to load fully. + """ + + def __init__(self, root_url: str, race_id: int, max_wait: int) -> None: + options = selenium.webdriver.ChromeOptions() + options.add_argument("--no-sandbox") # type: ignore[no-untyped-call] + options.add_argument("--window-size=1920,1080") # type: ignore[no-untyped-call] + options.add_argument("--headless") # type: ignore[no-untyped-call] + options.add_argument("--disable-gpu") # type: ignore[no-untyped-call] + self._browser = selenium.webdriver.Chrome(options=options) + self._root_url = root_url + self._race_id = race_id + self._max_wait = max_wait + self._can_compute_team_scores = False + + def _wait_for_element(self, by: str, value: str) -> None: + selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until( + EC.presence_of_element_located((by, value))) # type: ignore[no-untyped-call] + + def _wait_for_classification_computed(self) -> None: + selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until( + JavascriptVariableEvaluatesToTrue("document.updated")) + + def login(self, username: str, password: str) -> None: + """Log into the turing instance with the provided credentials.""" + self._browser.get(urllib.parse.urljoin(self._root_url, "accounts/login")) + # Wait for the login button to appear, and send credentials + self._wait_for_element(selenium.webdriver.common.by.By.ID, "submit") + self._browser.find_element(selenium.webdriver.common.by.By.NAME, "username").send_keys(username) + self._browser.find_element(selenium.webdriver.common.by.By.NAME, "password").send_keys(password) + self._browser.find_element(selenium.webdriver.common.by.By.ID, "submit").click() + # Successful login redirects to the home page, where there is a button "Crea una nuova gara" + self._wait_for_element(selenium.webdriver.common.by.By.CSS_SELECTOR, "a[href='/engine/gara/new']") + + def go_to_classification_page(self, classification_type: str, querystring: dict[str, str]) -> None: + """Direct the browser to visit a specific classification type.""" + self._browser.get( + urllib.parse.urljoin(self._root_url, f"engine/classifica/{self._race_id}/{classification_type}")) + # Wait for the classification to be fully computed + self._wait_for_classification_computed() + # Only certain classification views can compute team scores + self._can_compute_team_scores = (classification_type == "squadre") + + def get_team_scores(self) -> list[int]: + """Get the scores of the teams in the race.""" + assert self._can_compute_team_scores + team_id = 1 + scores = [] + while True: + score_elements = self._browser.find_elements( + selenium.webdriver.common.by.By.ID, f"label-points-{team_id}") + if len(score_elements) == 0: + break + else: + scores.append(int(score_elements[0].text)) + team_id += 1 + return scores + + def get_css_sources(self) -> dict[str, str]: + """Get the content of CSS files used in the current page.""" + soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser") + all_css = dict() + + for css in soup.find_all("link", rel="stylesheet"): + # Do not use the current selenium browser to fetch the css content, otherwise + # the browser would move away from the current page. However, since css content + # is static, simply downloading the page via the python package requests suffices. + response = requests.get(urllib.parse.urljoin(self._root_url, css["href"])) + assert response.status_code == 200 + filename = css["href"].split("/")[-1] + assert filename not in all_css, "Cannot have to css files with the same name" + all_css[filename] = response.text + + return all_css + + def get_html_source(self) -> str: + """ + Get the HTML source code of a page of the turing instance for local download. + + The HTML code is preprocessed as follows: + - the path of any css should be flattened to the one returned by get_css_sources. + - any local link to the live instance is removed, since it would not be available locally. + - any javascript is removed, since in order to be visible locally the page cannot contain + any script that requires the live server. + """ + soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser") + + # Flatten css path + for css in soup.find_all("link", rel="stylesheet"): + css["href"] = css["href"].split("/")[-1] + + # Remove local links + for a in soup.select("a[href]"): + assert isinstance(a["href"], str) + if a["href"].startswith("/"): + del a["href"] + + # Remove +Crea nuova gara + +""" + httpserver.expect_request("/accounts/login").respond_with_data(login_page, content_type="text/html") + httpserver.expect_request("/engine").respond_with_data(post_login_page, content_type="text/html") + + browser = Browser(httpserver) + browser.login("admin", "secret") + assert "Processing querystring" not in browser._browser.page_source + assert "username is admin and password is secret" in browser._browser.page_source + assert "Crea nuova gara" in browser._browser.page_source + + +def test_classification_browser_go_to_classification_page(httpserver: pytest_httpserver.HTTPServer) -> None: + """Test mathrace_interaction.network.TuringClassificationSelenium.go_to_classification_page.""" + classification_page = """ + +Processing classification + +""" + httpserver.expect_request("/engine/classifica/0/unica").respond_with_data( + classification_page, content_type="text/html") + + browser = Browser(httpserver) + browser.go_to_classification_page("unica", {}) + assert "Processing classification" not in browser._browser.page_source + assert "Classification has been computed" in browser._browser.page_source + + +def test_classification_browser_get_team_scores(httpserver: pytest_httpserver.HTTPServer) -> None: + """Test mathrace_interaction.network.TuringClassificationSelenium.get_team_scores.""" + classification_page = """ + + +Team 1: 0 +Team 2: 1 +Team 3: 2 + + +""" + httpserver.expect_request("/engine/classifica/0/squadre").respond_with_data( + classification_page, content_type="text/html") + + browser = Browser(httpserver) + browser.go_to_classification_page("squadre", {}) + assert 'Team 1: 0' in browser._browser.page_source + assert 'Team 2: 1' in browser._browser.page_source + assert 'Team 3: 2' in browser._browser.page_source + + # Click on the button once to check that it works correctly + browser._browser.find_element(selenium.webdriver.common.by.By.ID, "increment").click() + assert 'Team 1: 1' in browser._browser.page_source + assert 'Team 2: 3' in browser._browser.page_source + assert 'Team 3: 5' in browser._browser.page_source + + # Get the computed scores, and compare them with the expected ones + computed_scores = browser.get_team_scores() + assert computed_scores == [1, 3, 5] + + # Increment again the scores + browser._browser.find_element(selenium.webdriver.common.by.By.ID, "increment").click() + assert 'Team 1: 2' in browser._browser.page_source + assert 'Team 2: 5' in browser._browser.page_source + assert 'Team 3: 8' in browser._browser.page_source + + # Get the computed scores, and compare them with the expected ones + computed_scores = browser.get_team_scores() + assert computed_scores == [2, 5, 8] + + +def test_classification_browser_get_css_sources(httpserver: pytest_httpserver.HTTPServer) -> None: + """Test mathrace_interaction.network.TuringClassificationSelenium.get_css_sources.""" + index_page = """ + + + + + +
Hello world!
+ +""" + style1_css = """div { + background-color: #ff0000; +}""" + style2_css = """span { + color: #00ffff; +}""" + + httpserver.expect_request("/").respond_with_data(index_page, content_type="text/html") + httpserver.expect_request("/folder1/style1.css").respond_with_data(style1_css, content_type="text/css") + httpserver.expect_request("/folder2/subfolder2/style2.css").respond_with_data(style2_css, content_type="text/css") + + browser = Browser(httpserver) + browser._browser.get(httpserver.url_for("/")) + assert "Hello world!" in browser._browser.page_source + + # Ensure that background color and text color are as expected + container = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "container") + bg_color_rgba = container.value_of_css_property("background-color") + assert selenium.webdriver.support.color.Color.from_string(bg_color_rgba).hex == "#ff0000" + text = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "text") + color_rgba = text.value_of_css_property("color") + assert selenium.webdriver.support.color.Color.from_string(color_rgba).hex == "#00ffff" + + # Get a dictionary containing the sources of all CSS files + all_css = browser.get_css_sources() + assert len(all_css) == 2 + assert "style1.css" in all_css + assert "style2.css" in all_css + assert all_css["style1.css"] == style1_css + assert all_css["style2.css"] == style2_css + + +def test_classification_browser_get_html_source_replaces_css(httpserver: pytest_httpserver.HTTPServer) -> None: + """Test mathrace_interaction.network.TuringClassificationSelenium.get_html_source replaces css file names.""" + index_page = """ + + + + + +
Hello world!
+ +""" + style1_css = """div { + background-color: #ff0000; +}""" + style2_css = """span { + color: #00ffff; +}""" + + httpserver.expect_request("/").respond_with_data(index_page, content_type="text/html") + httpserver.expect_request("/folder1/style1.css").respond_with_data(style1_css, content_type="text/css") + httpserver.expect_request("/folder2/subfolder2/style2.css").respond_with_data(style2_css, content_type="text/css") + + browser = Browser(httpserver) + browser._browser.get(httpserver.url_for("/")) + assert "Hello world!" in browser._browser.page_source + assert '"/folder1/style1.css"' in browser._browser.page_source + assert '"/folder2/subfolder2/style2.css"' in browser._browser.page_source + + # Get the postprocessed HTML source, and check that it contains the flattened css names + postprocessed_source = browser.get_html_source() + assert '"style1.css"' in postprocessed_source + assert '"style2.css"' in postprocessed_source + assert '"/folder1/style1.css"' not in postprocessed_source + assert '"/folder2/subfolder2/style2.css"' not in postprocessed_source + + # Ensure that background color and text color are as expected, i.e. changing the file names + # in the postprocessed source did not affect the live page. + container = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "container") + bg_color_rgba = container.value_of_css_property("background-color") + assert selenium.webdriver.support.color.Color.from_string(bg_color_rgba).hex == "#ff0000" + text = browser._browser.find_element(selenium.webdriver.common.by.By.ID, "text") + color_rgba = text.value_of_css_property("color") + assert selenium.webdriver.support.color.Color.from_string(color_rgba).hex == "#00ffff" + + +def test_classification_browser_get_html_source_strips_links(httpserver: pytest_httpserver.HTTPServer) -> None: + """Test that mathrace_interaction.network.TuringClassificationSelenium.get_html_source strips links from output.""" + start_page = """ + +Redirect + +""" + redirect_page = """ + +Success! + +""" + httpserver.expect_request("/").respond_with_data(start_page, content_type="text/html") + httpserver.expect_request("/redirect").respond_with_data(redirect_page, content_type="text/html") + + browser = Browser(httpserver) + browser._browser.get(httpserver.url_for("/")) + assert "Redirect" in browser._browser.page_source + assert "/redirect" in browser._browser.page_source + assert " None: + """Test that mathrace_interaction.network.TuringClassificationSelenium.get_html_source strips js from output.""" + javascript_button_page = """ + + +Current time is 0 + + +""" + httpserver.expect_request("/").respond_with_data(javascript_button_page, content_type="text/html") + + browser = Browser(httpserver) + browser._browser.get(httpserver.url_for("/")) + assert "Current time is 0" in browser._browser.page_source + assert "