Skip to content

Commit

Permalink
Merge pull request #100 from openzim/ignore_localhost
Browse files Browse the repository at this point in the history
Also ignore assets hosted on localhost
  • Loading branch information
benoit74 authored Dec 6, 2024
2 parents b3c8911 + 2ce67c5 commit afbb3bf
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
8 changes: 6 additions & 2 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@
# Alias of __version__ (brought into scope above this excerpt; presumably the
# package version - confirm against the import)
VERSION = __version__
# Directory containing this module file
ROOT_DIR = pathlib.Path(__file__).parent

# Loading the CSS leads to many bad assets at these URLs, we just ignore them
STANDARD_KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"
# Standard regex for asset URLs which are known to be bad and are simply ignored:
# - http(s)://a.mtstatic.com/@cache and http(s)://a.mtstatic.com/@style: loading
#   the CSS leads to many bad assets at these URLs
# - http(s)://localhost followed by ":" (port) or "/" (path): multiple images are
#   badly loaded from localhost (=> no need to retry these)
# NOTE: the pattern carries no ^/$ anchors; whether it is matched at the start of
# the URL or searched anywhere in it depends on the code using it.
STANDARD_KNOWN_BAD_ASSETS_REGEX = (
r"https?:\/\/(a\.mtstatic\.com\/@(cache|style)|localhost(:|\/))"
)

# logger to use everywhere (not part of Context class because we need it early, before
# Context has been initialized)
Expand Down
16 changes: 16 additions & 0 deletions scraper/tests/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,19 @@ def test_context_setup_again(context_defaults):
context = Context.get()
assert context.title == "A title"
assert context == processor_context # check both objects are same


@pytest.mark.parametrize(
    "url, matching",
    [
        pytest.param("http://localhost:9999/foo", True, id="localhost1"),
        pytest.param("https://localhost/foo/bar.html", True, id="localhost2"),
        pytest.param("http://a.mtstatic.com/@cache/bar", True, id="mtstatic_cache"),
        pytest.param("https://a.mtstatic.com/@style/bar", True, id="mtstatic_style"),
        pytest.param("https://a.mtstatic.com/@stule/bar", False, id="mtstatic_stule"),
        pytest.param("https://aamtstaticacom/@style/bar", False, id="replace_dots"),
    ],
)
def test_context_bad_assets(url: str, *, matching: bool):
    """Check which URLs ``Context.bad_assets_regex`` flags as bad assets.

    ``findall`` must return at least one hit for URLs expected to match
    (localhost, a.mtstatic.com/@cache, a.mtstatic.com/@style) and nothing for
    the look-alike URLs (typo in the path, dots replaced in the host name).
    """
    found = Context.bad_assets_regex.findall(url)
    if matching:
        # Known-bad asset: the regex has to hit at least once
        assert found
    else:
        # Look-alike URL: the regex must not hit at all
        assert not found

0 comments on commit afbb3bf

Please sign in to comment.