Skip to content

Commit

Permalink
Add class TuringClassificationSelenium to automate download of classi…
Browse files Browse the repository at this point in the history
…fications and scores from turing
  • Loading branch information
francesco-ballarin committed Aug 22, 2024
1 parent 7753371 commit 2257e87
Show file tree
Hide file tree
Showing 4 changed files with 461 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@

from mathrace_interaction.network.get_ssh_client import get_ssh_client
from mathrace_interaction.network.open_file_on_ssh_host import open_file_on_ssh_host
from mathrace_interaction.network.turing_classification_selenium import TuringClassificationSelenium
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Copyright (C) 2024 by the Turing @ DMF authors
#
# This file is part of Turing @ DMF.
#
# SPDX-License-Identifier: AGPL-3.0-or-later
"""A selenium browser that connects to a classification page on the current live turing instance."""

import urllib.parse

import bs4
import requests
import selenium.webdriver
import selenium.webdriver.common.by
import selenium.webdriver.support.expected_conditions as EC # noqa: N812
import selenium.webdriver.support.ui


class TuringClassificationSelenium:
    """
    A selenium browser that connects to a classification page on the current live turing instance.

    Parameters
    ----------
    root_url
        URL of the root of the turing website.
    race_id
        The ID of the turing race to follow.
    max_wait
        Maximum amount to wait in seconds for the requested page to load fully.

    Attributes
    ----------
    _browser
        The selenium browser that will be used to connect to the website.
    _root_url
        URL of the root of the turing website.
    _race_id
        The ID of the turing race to follow.
    _max_wait
        Maximum amount to wait in seconds for the requested page to load fully.
    _can_compute_team_scores
        Whether team scores can be read from the page currently displayed. It is updated by
        go_to_classification_page and checked by get_team_scores.
    """

    def __init__(self, root_url: str, race_id: int, max_wait: int) -> None:
        # Run a headless Chrome with a fixed window size
        options = selenium.webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")  # type: ignore[no-untyped-call]
        options.add_argument("--window-size=1920,1080")  # type: ignore[no-untyped-call]
        options.add_argument("--headless")  # type: ignore[no-untyped-call]
        options.add_argument("--disable-gpu")  # type: ignore[no-untyped-call]
        self._browser = selenium.webdriver.Chrome(options=options)
        self._root_url = root_url
        self._race_id = race_id
        self._max_wait = max_wait
        # No classification page has been visited yet
        self._can_compute_team_scores = False

    def _wait_for_element(self, by: str, value: str) -> None:
        """Wait, for at most the maximum allowed time, until the requested element is present in the page."""
        selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until(
            EC.presence_of_element_located((by, value)))  # type: ignore[no-untyped-call]

    def _wait_for_classification_computed(self) -> None:
        """Wait, for at most the maximum allowed time, until the javascript variable document.updated is true."""
        selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until(
            JavascriptVariableEvaluatesToTrue("document.updated"))

    def _classification_url(self, classification_type: str, querystring: dict[str, str]) -> str:
        """Build the URL of a classification page of the followed race, including the optional querystring."""
        url = urllib.parse.urljoin(self._root_url, f"engine/classifica/{self._race_id}/{classification_type}")
        if querystring:
            # Keys and values are percent-encoded, so that they cannot alter the structure of the URL
            url += "?" + urllib.parse.urlencode(querystring)
        return url

    def login(self, username: str, password: str) -> None:
        """Log into the turing instance with the provided credentials."""
        self._browser.get(urllib.parse.urljoin(self._root_url, "accounts/login"))
        # Wait for the login button to appear, and send credentials
        self._wait_for_element(selenium.webdriver.common.by.By.ID, "submit")
        self._browser.find_element(selenium.webdriver.common.by.By.NAME, "username").send_keys(username)
        self._browser.find_element(selenium.webdriver.common.by.By.NAME, "password").send_keys(password)
        self._browser.find_element(selenium.webdriver.common.by.By.ID, "submit").click()
        # Successful login redirects to the home page, where there is a button "Crea una nuova gara"
        self._wait_for_element(selenium.webdriver.common.by.By.CSS_SELECTOR, "a[href='/engine/gara/new']")

    def go_to_classification_page(self, classification_type: str, querystring: dict[str, str]) -> None:
        """
        Direct the browser to visit a specific classification type.

        Parameters
        ----------
        classification_type
            The type of the classification, which is the last component of the path of the page.
        querystring
            Parameters to be appended to the URL as a querystring. An empty dictionary leaves
            the URL without any querystring.
        """
        self._browser.get(self._classification_url(classification_type, querystring))
        # Wait for the classification to be fully computed
        self._wait_for_classification_computed()
        # Only certain classification views can compute team scores
        self._can_compute_team_scores = (classification_type == "squadre")

    def get_team_scores(self) -> list[int]:
        """
        Get the scores of the teams in the race.

        Scores are read from the elements with ID label-points-1, label-points-2, and so on,
        stopping at the first ID which is not found in the page.
        """
        assert self._can_compute_team_scores
        team_id = 1
        scores = []
        while True:
            score_elements = self._browser.find_elements(
                selenium.webdriver.common.by.By.ID, f"label-points-{team_id}")
            if not score_elements:
                # The first missing ID marks the end of the list of teams
                break
            scores.append(int(score_elements[0].text))
            team_id += 1
        return scores

    def get_css_sources(self) -> dict[str, str]:
        """
        Get the content of CSS files used in the current page.

        Returns
        -------
        :
            A dictionary from the name of each CSS file, stripped of its path, to its content.
        """
        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")
        all_css: dict[str, str] = {}

        for css in soup.find_all("link", rel="stylesheet"):
            # Do not use the current selenium browser to fetch the css content, otherwise
            # the browser would move away from the current page. However, since css content
            # is static, simply downloading the page via the python package requests suffices.
            # A timeout is set because requests would otherwise wait indefinitely on a stalled server.
            response = requests.get(urllib.parse.urljoin(self._root_url, css["href"]), timeout=self._max_wait)
            assert response.status_code == 200
            filename = css["href"].split("/")[-1]
            assert filename not in all_css, "Cannot have to css files with the same name"
            all_css[filename] = response.text

        return all_css

    def get_html_source(self) -> str:
        """
        Get the HTML source code of a page of the turing instance for local download.

        The HTML code is preprocessed as follows:
        - the path of any css should be flattened to the one returned by get_css_sources.
        - any local link to the live instance is removed, since it would not be available locally.
        - any javascript is removed, since in order to be visible locally the page cannot contain
          any script that requires the live server.
        """
        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")

        # Flatten css path
        for css in soup.find_all("link", rel="stylesheet"):
            css["href"] = css["href"].split("/")[-1]

        # Remove local links
        for a in soup.select("a[href]"):
            assert isinstance(a["href"], str)
            if a["href"].startswith("/"):
                del a["href"]

        # Remove <script> tags
        for script in soup.select("script"):
            script.decompose()

        # Return postprocessed page
        return str(soup)


class JavascriptVariableEvaluatesToTrue:
    """
    Wait condition which is fulfilled once a javascript variable is true.

    Instances are callables which take the driver as their only argument, so that they can be
    passed to the until method of a selenium wait.
    """

    def __init__(self, variable: str) -> None:
        # Javascript expression that will be evaluated in the page on every call
        self._variable = variable

    def __call__(self, driver: selenium.webdriver.Chrome) -> bool:
        """Evaluate the javascript variable in the page currently loaded by the driver, and return its value."""
        script = f"return {self._variable};"
        outcome = driver.execute_script(script)  # type: ignore[no-untyped-call]
        return outcome  # type: ignore[no-any-return]
8 changes: 7 additions & 1 deletion mathrace_interaction/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@ classifiers = [
"Topic :: Software Development :: Libraries :: Python Modules"
]
dependencies = [
"bs4",
"jsondiff",
"paramiko"
"paramiko",
"requests",
"selenium"
]

[project.urls]
Expand All @@ -47,7 +50,9 @@ lint = [
"isort",
"mypy",
"ruff",
"types-beautifulsoup4",
"types-paramiko",
"types-requests",
"yamllint"
]
tests = [
Expand All @@ -56,6 +61,7 @@ tests = [
"pycryptodomex",
"pytest",
"pytest-django",
"pytest_httpserver",
"pytest-random-order"
]

Expand Down
Loading

0 comments on commit 2257e87

Please sign in to comment.