From ca6720f5b748e0a7d1c5688ab337cfcdafb66136 Mon Sep 17 00:00:00 2001
From: Eric Herman
Date: Thu, 26 Oct 2023 16:16:39 +0200
Subject: [PATCH] add --dry-run, transform before ignore

---
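With --dry-run, main() still loads the config, scans the repositories,
and collects URLs, but status_code_for_url() is never called: a
placeholder status of -1 is recorded in the in-memory checks, the
collected checks are logged, and main() returns before write_json()
touches any results files. A minimal smoke test of the flag might look
like this (the config path is hypothetical):

    ./url-check.py --dry-run --config my-url-check.json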
 url-check.py      | 71 ++++++++++++++++++++++++++++-------------------
 url-check.test.py | 54 ++++++++++++++++-------------------
 2 files changed, 66 insertions(+), 59 deletions(-)
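The "transform before ignore" half of the change retires the
transform_urls() helper and instead applies each transform inside the
grep pipeline that urls_from() builds, immediately after URL
extraction and before the ignore-pattern filtering. The keys of the
config's "transforms" map are used verbatim as pipeline stages (as the
new TODO notes, ordering matters; the map's values are not read by
this code). A hypothetical config entry, assuming an empty string as a
placeholder value:

    {
        "transforms": {
            "sed 's@obsolete\\.html@current.html@g'": ""
        }
    }

The new url_pattern depends on the shell quote-pasting trick that the
added comment walks through; a quick way to see {'"'"'} collapse into
a single quote:

    $ echo 'a'"'"'b'
    a'b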
diff --git a/url-check.py b/url-check.py
index 90bfa7c..d9ff719 100755
--- a/url-check.py
+++ b/url-check.py
@@ -42,6 +42,7 @@
     -t SECONDS, --timeout=SECONDS
                    timeout set on the request [default: {default_timeout}]
+    -d, --dry-run  do not fetch the URLs or update the checks
     -h, --help     Prints this message
     -V, --version  Prints the version ({url_check_version})
 
@@ -86,8 +87,9 @@ def shell_slurp(cmd_str, working_dir=os.getcwd(), ctx=None, fail_func=None):
     if (result.returncode and fail_func):
         return fail_func(result)
-    text = result.stdout.decode("utf-8")
+    text = result.stdout.decode("utf-8").strip()
     ctx.debug(text)
+
     return text
 
 
@@ -115,18 +117,31 @@ def files_from_repo(repos_basedir, repo_name, repo_url, branch, ctx=None):
     return files
 
 
-def urls_from(workdir, file, user_ignore_patterns=[], ctx=None):
+def urls_from(workdir, file, transforms, user_ignore_patterns=[], ctx=None):
+    # pull URLs out of the file, including optional leading paren
     # TODO: Regex does not fully conform to RFC 3986 URI Generic Syntax.
     # Some valid characters are only valid in parts of the URI.
-    # Some valid characters are not matched by the current regex,
-    # e.g.: http://example.org/foo#named-anchor
+    # Some valid characters are not matched by the current regex.
+    # Note: single-quote escaping is hard to read: in a bash single-quoted
+    # string, the sequence {'"'"'} becomes {'} by:
+    # * ending the single-quoted string
+    # * starting a double-quoted string
+    # * having a single quote inside the double quotes
+    # * ending the double-quoted string
+    # * starting a new single-quoted string
+    # Below, the single quotes need to be escaped by python:
+    url_pattern = 'http[s]?://[^[:space:]<>"`\'"\'"\']+'
     cmd_str = f"grep --extended-regexp --only-matching --text \
-        '[\\(]?(http|https)://[-a-zA-Z0-9\./\\?=_%:\\(\\)]*' \
+        '[\\(]?{url_pattern}' \
         '{file}'"
+    for transform in transforms:
+        cmd_str = cmd_str + f" | {transform}"
+    # remove surrounding parens if they exist
     cmd_str += " | sed -e 's/^(http\\(.*\\))[\\.,]\\?$/http\\1/g'"
+    # de-duplicate
     cmd_str += " | sort --unique"
 
@@ -164,28 +179,10 @@ def clear_previous_used(checks, name):
             checks[url]["used"][name] = []
 
 
-def transform_urls(transforms, urls, ctx):
-    urls_str = "\n".join(urls)
-
-    cmd = f"echo '{urls_str}'"
-    for transform in transforms:
-        cmd = cmd + f" | {transform}"
-
-    urls_str = shell_slurp(cmd, ".", ctx)
-
-    transformed_urls = []
-    for line in urls_str.splitlines():
-        if line.startswith("http"):
-            transformed_urls += [line]
-
-    return transformed_urls
-
-
 def set_used_for_file(
         checks, gits_dir, name, file, ignore_patterns, transforms, ctx):
     repo_dir = os.path.join(gits_dir, name)
-    urls = urls_from(repo_dir, file, ignore_patterns, ctx)
-    urls = transform_urls(transforms, urls, ctx)
+    urls = urls_from(repo_dir, file, transforms, ignore_patterns, ctx)
     for url in urls:
         if url not in checks.keys():
             checks[url] = {}
@@ -268,6 +265,7 @@ def status_code_for_url(url, timeout, ctx=None):
 
 class System_Context:
     verbose = False
+    dry_run = False
 
     def now(self):
         return str(datetime.datetime.utcnow())
@@ -340,7 +338,10 @@ def update_status_codes_for_urls(urls, checks, timeout, ctx):
         ctx.log("")
         when = ctx.now()
         ctx.log(when, url)
-        status_code = status_code_for_url(url, timeout, ctx)
+        if ctx.dry_run:
+            status_code = -1
+        else:
+            status_code = status_code_for_url(url, timeout, ctx)
         ctx.log(status_code, url)
         update_status(checks[url]["checks"], status_code, when, ctx)
         updated.append(checks[url])
@@ -348,11 +349,18 @@ def update_status_codes_for_urls(urls, checks, timeout, ctx):
     return updated
 
 
-def group_by_second_level_domain(urls):
+def group_by_second_level_domain(urls, ctx):
     domain_dict = {}
     for url in sorted(set(urls)):
-        parsed_url = urllib.parse.urlparse(url)
+        try:
+            parsed_url = urllib.parse.urlparse(url)
+        except Exception as e:
+            if ctx is None:
+                ctx = default_context()
+            ctx.debug({'url': url, 'error': e})
+            continue
+
         domain = parsed_url.netloc
         # split by dot; get the last two parts
         domain_parts = domain.split('.')[-2:]
@@ -389,7 +397,7 @@ def url_check_all(gits_dir,
     checks = sort_by_key(checks)
 
-    domain_dict = group_by_second_level_domain(checks.keys())
+    domain_dict = group_by_second_level_domain(checks.keys(), ctx)
     pool = multiprocessing.Pool(processes=16)
     pfunc = functools.partial(
         update_status_codes_for_urls, checks=checks, timeout=timeout, ctx=ctx)
@@ -466,6 +474,7 @@ def main(sys_argv=sys.argv, ctx=default_context()):
         ctx.log(f"version {url_check_version}")
         return
 
+    ctx.dry_run = args['--dry-run']
     gits_dir = args['--gits-dir']
     cfg_path = args['--config']
     checks_path = args['--results']
@@ -475,6 +484,7 @@ def main(sys_argv=sys.argv, ctx=default_context()):
     repos_info = config_obj["repositories"]
     ignore_patterns_map = config_obj.get("ignore_patterns", {})
     add_ignore_patterns = ignore_patterns_map.keys()
+    # TODO: transforms_map should be ordered, perhaps convert to list?
     transforms_map = config_obj.get("transforms", {})
     transforms = transforms_map.keys()
 
@@ -484,10 +494,13 @@ def main(sys_argv=sys.argv, ctx=default_context()):
     checks = url_check_all(gits_dir, orig_checks, repos_files, timeout,
                            add_ignore_patterns, transforms, ctx)
 
+    if ctx.dry_run:
+        ctx.log(checks)
+        return
+
     write_json(checks_path, checks)
     condensed = condense_results(checks, repos_info.keys())
     write_json(check_fails_json, condensed)
-
     repo_results(repos_info, checks, checks_path, check_fails_json)
diff --git a/url-check.test.py b/url-check.test.py
index 4640883..ce1062b 100755
--- a/url-check.test.py
+++ b/url-check.test.py
@@ -13,10 +13,11 @@
 
 class Test_Context:
 
-    def __init__(self, capture=False, verbose=False):
+    def __init__(self, capture=False, verbose=False, dry_run=False):
         self.now_calls = 0
         self.now_time = ""
         self.verbose = verbose
+        self.dry_run = dry_run
         self.capture = capture
         self.out = ''
 
@@ -77,15 +78,34 @@ def test_urls_from(self):
         name = "url-check"
         workdir = os.path.join(gits_dir, name)
         file = "url-check.test.py"
-        our_ignore_patters = ['^http[s]\?://bogus.gov']
-        ctx = Test_Context()
-        found = uc.urls_from(workdir, file, our_ignore_patters, ctx)
+        transform_urls = [
+            'https://example.com/one.html).',
+            'https://example.com/obsolete.html',
+            'https://example.com/three.html',
+        ]
+        transforms = [
+            f"sed 's@obsolete\\.html@TEMPFIX.html@g'",
+            f"sed 's@TEMPFIX\\.html@two.html@g'",
+            f"sed 's@\\(.*html\\)[\\.,):!]*$@\\1@g'",
+        ]
+        ignores = ['^http[s]\?://bogus.gov']
+        # ctx = Test_Context()
+        ctx = Test_Context(capture=True)
+        found = uc.urls_from(workdir, file, transforms, ignores, ctx)
+        print(ctx.out)
         self.assertIn("https://example.org/", found)
         self.assertNotIn("http://bogus.gov", found)
         self.assertIn(paren_url, found)
         self.assertIn('http://example.org/' + 'b-(baz)', found)
         self.assertNotIn('http://example.org/' + 'b-(baz))', found)
         self.assertIn('http://example.org/index.html', found)
+        # transforms
+        self.assertNotIn('https://example.com/' + 'one.html).', found)
+        self.assertNotIn('https://example.com/' + 'obsolete.html', found)
+        self.assertNotIn('https://example.com/' + 'TEMPFIX.html', found)
+        self.assertIn('https://example.com/' + 'one.html', found)
+        self.assertIn('https://example.com/' + 'two.html', found)
+        self.assertIn('https://example.com/' + 'three.html', found)
 
     def test_clear_previous_used(self):
         name1 = "blog.example.net"
@@ -201,32 +221,6 @@ def test_read_repos_files(self):
         self.assertIn("url-check.test.py", repo_files[repo_name])
         self.assertNotIn("README.md", repo_files[repo_name])
 
-    def test_transform_urls(self):
-        ctx = Test_Context()
-        ctx.capture = True
-        ctx.verbose = True
-        transforms = []
-        urls = [
-            'https://example.org/one.html',
-            'https://example.org/obsolete.html',
-            'https://example.org/three.html',
-        ]
-        transformed = uc.transform_urls(transforms, urls, ctx)
-        self.assertEqual(transformed, urls)
-
-        transforms = [
-            f"sed 's@obsolete\\.html@foo.html)@g'",
-            f"sed 's@foo@two@g'",
-            f"sed 's@\\(example.org/.*\\)[\\.,)]$@\\1@g'",
-        ]
-        expected_urls = [
-            'https://example.org/one.html',
-            'https://example.org/two.html',
-            'https://example.org/three.html',
-        ]
-        transformed = uc.transform_urls(transforms, urls, ctx)
-        self.assertEqual(transformed, expected_urls, ctx.out)
-
     def test_remove_unused(self):
         url3 = "https://example.org/three.html"
         checks = {