-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add class TuringClassificationSelenium to automate download of classifications and scores from turing
- Loading branch information
1 parent
7753371
commit 2257e87
Showing
4 changed files
with
461 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
153 changes: 153 additions & 0 deletions
153
mathrace_interaction/mathrace_interaction/network/turing_classification_selenium.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# Copyright (C) 2024 by the Turing @ DMF authors | ||
# | ||
# This file is part of Turing @ DMF. | ||
# | ||
# SPDX-License-Identifier: AGPL-3.0-or-later | ||
"""A selenium browser that connects to a classification page on the current live turing instance.""" | ||
|
||
import urllib.parse | ||
|
||
import bs4 | ||
import requests | ||
import selenium.webdriver | ||
import selenium.webdriver.common.by | ||
import selenium.webdriver.support.expected_conditions as EC # noqa: N812 | ||
import selenium.webdriver.support.ui | ||
|
||
|
||
class TuringClassificationSelenium:
    """
    A selenium browser that connects to a classification page on the current live turing instance.

    Parameters
    ----------
    root_url
        URL of the root of the turing website.
    race_id
        The ID of the turing race to follow.
    max_wait
        Maximum amount to wait in seconds for the requested page to load fully.

    Attributes
    ----------
    _browser
        The selenium browser that will be used to connect to the website.
    _root_url
        URL of the root of the turing website.
    _race_id
        The ID of the turing race to follow.
    _max_wait
        Maximum amount to wait in seconds for the requested page to load fully.
    _can_compute_team_scores
        Whether the last classification page that was successfully loaded is one
        from which team scores can be read.
    """

    def __init__(self, root_url: str, race_id: int, max_wait: int) -> None:
        options = selenium.webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")  # type: ignore[no-untyped-call]
        options.add_argument("--window-size=1920,1080")  # type: ignore[no-untyped-call]
        options.add_argument("--headless")  # type: ignore[no-untyped-call]
        options.add_argument("--disable-gpu")  # type: ignore[no-untyped-call]
        self._browser = selenium.webdriver.Chrome(options=options)
        self._root_url = root_url
        self._race_id = race_id
        self._max_wait = max_wait
        # No classification page has been visited yet, hence scores cannot be read
        self._can_compute_team_scores = False

    def _wait_for_element(self, by: str, value: str) -> None:
        """Block until an element matching the locator is present, for at most max_wait seconds."""
        selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until(
            EC.presence_of_element_located((by, value)))  # type: ignore[no-untyped-call]

    def _wait_for_classification_computed(self) -> None:
        """Block until the javascript variable document.updated is true, for at most max_wait seconds."""
        selenium.webdriver.support.wait.WebDriverWait(self._browser, self._max_wait).until(
            JavascriptVariableEvaluatesToTrue("document.updated"))

    def _classification_url(self, classification_type: str, querystring: dict[str, str]) -> str:
        """
        Build the URL of a classification page of the race.

        The query string is URL-encoded and appended only when it is not empty,
        so that an empty dictionary does not leave a dangling "?" in the URL.
        """
        url = urllib.parse.urljoin(self._root_url, f"engine/classifica/{self._race_id}/{classification_type}")
        if querystring:
            url = f"{url}?{urllib.parse.urlencode(querystring)}"
        return url

    def login(self, username: str, password: str) -> None:
        """Log into the turing instance with the provided credentials."""
        by = selenium.webdriver.common.by.By
        self._browser.get(urllib.parse.urljoin(self._root_url, "accounts/login"))
        # Wait for the login button to appear, and send credentials
        self._wait_for_element(by.ID, "submit")
        self._browser.find_element(by.NAME, "username").send_keys(username)
        self._browser.find_element(by.NAME, "password").send_keys(password)
        self._browser.find_element(by.ID, "submit").click()
        # Successful login redirects to the home page, where there is a button "Crea una nuova gara"
        self._wait_for_element(by.CSS_SELECTOR, "a[href='/engine/gara/new']")

    def go_to_classification_page(self, classification_type: str, querystring: dict[str, str]) -> None:
        """
        Direct the browser to visit a specific classification type.

        Parameters
        ----------
        classification_type
            The classification view to visit, used as the last component of the page path.
        querystring
            Query parameters to be appended to the page URL. May be empty.
        """
        # Forget the state of the previous page first: if the navigation or the wait below
        # fail, a stale flag must not allow reading scores from a partially loaded page.
        self._can_compute_team_scores = False
        self._browser.get(self._classification_url(classification_type, querystring))
        # Wait for the classification to be fully computed
        self._wait_for_classification_computed()
        # Only certain classification views can compute team scores
        self._can_compute_team_scores = (classification_type == "squadre")

    def get_team_scores(self) -> list[int]:
        """
        Get the scores of the teams in the race.

        Team IDs are probed consecutively starting from 1, looking for the element with
        ID label-points-<team ID>. The first team ID without such an element ends the scan.
        The text of each element is converted with int, which raises ValueError if the
        text is not an integer.
        """
        assert self._can_compute_team_scores, "Team scores are only available on the squadre classification"
        scores: list[int] = []
        team_id = 1
        while True:
            score_elements = self._browser.find_elements(
                selenium.webdriver.common.by.By.ID, f"label-points-{team_id}")
            if not score_elements:
                break
            scores.append(int(score_elements[0].text))
            team_id += 1
        return scores

    def get_css_sources(self) -> dict[str, str]:
        """
        Get the content of CSS files used in the current page.

        Returns
        -------
        :
            A dictionary mapping the file name of each stylesheet (i.e., the last component
            of its href) to the text downloaded from that href.
        """
        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")
        all_css: dict[str, str] = {}

        for css in soup.find_all("link", rel="stylesheet"):
            # Do not use the current selenium browser to fetch the css content, otherwise
            # the browser would move away from the current page. However, since css content
            # is static, simply downloading the page via the python package requests suffices.
            css_url = urllib.parse.urljoin(self._root_url, css["href"])
            # Bound the download time as well, since requests would otherwise wait indefinitely
            response = requests.get(css_url, timeout=self._max_wait)
            assert response.status_code == 200, f"Downloading {css_url} failed with status {response.status_code}"
            filename = css["href"].split("/")[-1]
            assert filename not in all_css, "Cannot have to css files with the same name"
            all_css[filename] = response.text

        return all_css

    def get_html_source(self) -> str:
        """
        Get the HTML source code of a page of the turing instance for local download.

        The HTML code is preprocessed as follows:
        - the path of any css should be flattened to the one returned by get_css_sources.
        - any local link to the live instance is removed, since it would not be available locally.
        - any javascript is removed, since in order to be visible locally the page cannot contain
          any script that requires the live server.
        """
        soup = bs4.BeautifulSoup(self._browser.page_source, "html.parser")

        # Flatten css path
        for css in soup.find_all("link", rel="stylesheet"):
            css["href"] = css["href"].split("/")[-1]

        # Remove local links: only the href attribute is dropped, the <a> tag itself is preserved
        for a in soup.select("a[href]"):
            assert isinstance(a["href"], str)
            if a["href"].startswith("/"):
                del a["href"]

        # Remove <script> tags
        for script in soup.select("script"):
            script.decompose()

        # Return postprocessed page
        return str(soup)
|
||
|
||
class JavascriptVariableEvaluatesToTrue:
    """
    Helper class used to wait until a javascript variable is true.

    Parameters
    ----------
    variable
        The javascript expression to be evaluated in the page, e.g. the name of a variable.

    Attributes
    ----------
    _variable
        The javascript expression to be evaluated in the page.
    """

    def __init__(self, variable: str) -> None:
        self._variable = variable

    def __call__(self, driver: selenium.webdriver.Chrome) -> bool:
        """
        Condition for waiting until the javascript variable is true.

        The value returned by the script is coerced to a boolean, so that values like
        None (resulting from an undefined javascript variable) are reported as False.
        """
        return bool(driver.execute_script(f"return {self._variable};"))  # type: ignore[no-untyped-call]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.