Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplifications #3

Draft
wants to merge 4 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions sre_tools/_analyse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from sre_parse import (
CATEGORY,
CHCODES,
error,
IN,
NEGATE,
)


def _handle_error(tok, val, msg):
    """Default error callback for ``preprocess``.

    ``tok`` and ``val`` identify the offending token but are not used here;
    the callback simply raises ``sre_parse.error`` carrying ``msg``.
    """
    raise error(msg)


def preprocess(seq, error=_handle_error):
    """Annotate every ``(tok, val)`` pair of *seq* with analysis data.

    Yields ``(tok, val, data)`` triples where ``data`` is:

    * ``None`` for every token other than ``IN``;
    * ``True`` for a non-negated ``IN`` whose categories include both
      members of one ``CHCODES`` pair (e.g. ``[\\s\\S]``);
    * otherwise the sorted list of ``CATEGORY`` codes found in the class.

    ``error`` is called as ``error(tok, val, msg)`` when a negated class
    contains such a pair (e.g. ``[^\\s\\S]``).  The default callback
    raises ``sre_parse.error``.  If a custom callback returns instead of
    raising, the token is still yielded with its category list so the output
    stays aligned with the input.
    """
    # Pair CHCODES[0] with CHCODES[1], CHCODES[2] with CHCODES[3], ...
    # This relies on CHCODES listing each category next to its complement.
    complement_pairs = list(zip(CHCODES[::2], CHCODES[1::2]))

    # Snapshot the input so later mutation of ``seq`` cannot affect iteration.
    for tok, val in list(seq):
        if tok != IN:
            yield tok, val, None
            continue

        negate = bool(val) and val[0] == (NEGATE, None)
        cats = sorted(in_cat for in_type, in_cat in val if in_type == CATEGORY)
        matches_all = any(
            first in cats and second in cats
            for first, second in complement_pairs
        )

        if not matches_all:
            yield tok, val, cats
        elif negate:
            # Use the caller-supplied callback rather than the module default,
            # so a custom handler is actually honoured.
            error(tok, val, "cant negate all matches")
            yield tok, val, cats
        else:
            yield tok, val, True
14 changes: 10 additions & 4 deletions sre_tools/_simplify.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,21 @@
LITERAL,
MAX_REPEAT,
MAXREPEAT,
MIN_REPEAT,
SubPattern,
SUBPATTERN,
)

from .utils import create_subpattern
from ._analyse import preprocess


def _simplify_sre_list(seq):
new = []

prev_tok = None
prev_val = None
for i, (tok, val) in enumerate(list(seq)):
for i, (tok, val, data) in enumerate(preprocess(seq)):
if tok == prev_tok and (val == prev_val or _val_eq(val, prev_val)):
if tok == MAX_REPEAT:
min_repeat = min(MAXREPEAT, prev_val[0] + val[0])
Expand Down Expand Up @@ -61,12 +63,16 @@ def _simplify_sre_list(seq):
new[-1] = prev_tok, prev_val
continue

elif tok == MAX_REPEAT:
elif tok in (MAX_REPEAT, MIN_REPEAT):
val = (val[0], val[1], create_subpattern(_simplify_sre_list(val[2])))

elif tok == SUBPATTERN:
val = (*val[0:3], create_subpattern(_simplify_sre_list(val[3])))

elif tok == IN:
if data is True:
tok, val = ANY, None

new.append((tok, val))
prev_tok = tok
prev_val = val
Expand All @@ -82,7 +88,7 @@ def _pair_eq(a, b):
val_a = a[1]
val_b = b[1]

if tok == MAX_REPEAT:
if tok in (MAX_REPEAT, MIN_REPEAT):
if val_a[0:2] != val_b[0:2]:
return False
return _val_eq(val_a[-1], val_b[-1])
Expand Down Expand Up @@ -137,7 +143,7 @@ def _val_eq(a, b):

if length == 2:
tok, val = a
if tok == MAX_REPEAT:
if tok in (MAX_REPEAT, MIN_REPEAT):
return _val_eq(val, b[1])
elif tok == SUBPATTERN:
return _pair_eq(a, b)
Expand Down
7 changes: 7 additions & 0 deletions sre_tools/analyse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .utils import _subpattern_invoke
from ._analyse import preprocess


def check_regex(pattern):
    """Run ``preprocess`` over *pattern* and return its output as a list.

    The pattern is passed through ``_subpattern_invoke``; only the second
    element of the returned pair (the ``preprocess`` output) is kept.
    Building the list consumes the generator, so any error raised during
    preprocessing surfaces from this call.
    """
    invoked = _subpattern_invoke(pattern, preprocess)
    analysed = invoked[1]
    return [entry for entry in analysed]
8 changes: 2 additions & 6 deletions sre_tools/simplify.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from sre_parse import SubPattern, parse

from .utils import clone_subpattern
from .utils import _subpattern_invoke, clone_subpattern
from ._simplify import _simplify_sre_list


def simplify_regex(pattern):
    """Return a simplified clone of *pattern*.

    ``_subpattern_invoke`` receives the pattern together with
    ``_simplify_sre_list`` and returns the pattern object plus the simplified
    sequence.  The two are combined by ``clone_subpattern``.

    The pattern is handed over exactly once: parsing and simplifying it here
    as well would repeat the work and throw the first result away.
    """
    pattern, seq = _subpattern_invoke(pattern, _simplify_sre_list)
    return clone_subpattern(pattern, seq)
8 changes: 7 additions & 1 deletion sre_tools/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from sre_parse import SubPattern
from sre_parse import SubPattern, parse

try:
from sre_parse import Pattern
except ImportError:
from sre_parse import State as Pattern


def _subpattern_invoke(pattern, func):
    """Apply *func* to the ``data`` of *pattern* and return both.

    A ``SubPattern`` instance is used as-is; anything else is first handed
    to ``parse``.  Returns ``(subpattern, func(subpattern.data))``.
    """
    parsed = pattern if isinstance(pattern, SubPattern) else parse(pattern)
    return parsed, func(parsed.data)


def clone_subpattern(subpattern, data=None):
if not data:
data = subpattern.data
Expand Down
23 changes: 23 additions & 0 deletions tests/test_analyse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from __future__ import unicode_literals

import re
import sre_parse
import unittest

from sre_tools.analyse import check_regex


class TestAnalyseRegex(unittest.TestCase):
    # Exercises check_regex on character classes that combine a category
    # with its complement, both plain and negated.

    def _assert_valid(self, a):
        # Passes as long as check_regex(a) does not raise.
        check_regex(a)

    def _assert_invalid(self, a, msg):
        # assertRaisesRegexp was a deprecated alias and is gone in
        # Python 3.12; assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(re.error, msg):
            check_regex(a)

    def test_valid_union(self):
        self._assert_valid(r"[\s\S]")

    def test_invalid_union_inverted(self):
        self._assert_invalid(r"[^\s\S]", "cant negate all matches")
14 changes: 14 additions & 0 deletions tests/test_simplify.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import unicode_literals

import re
import sre_compile
import sre_parse
import unittest
Expand Down Expand Up @@ -47,6 +48,14 @@ def test_merge_repeated_mixed(self):
self._assert_equal(r"aaa", r"a{3}")
self._assert_equal(r"aa{2}", r"a{3}")

def test_merge_class_union(self):
    # Both classes contain \s together with \S and are expected to
    # simplify to a single ".".
    for source in (r"[\s\S]", r"[\s \S]"):
        self._assert_equal(source, r".")

def test_merge_class_union_inverted(self):
    # A negated class holding both \s and \S must be rejected.
    # assertRaisesRegexp was a deprecated alias and is gone in Python 3.12;
    # assertRaisesRegex is the supported spelling.
    with self.assertRaisesRegex(re.error, "cant negate all matches"):
        simplify_regex(r" [^\s\S]")

def test_anchor(self):
self._assert_equal(r"^aaa", r"^a{3}")

Expand All @@ -63,3 +72,8 @@ def test_subpattern_capture_nested(self):
self._assert_equal(
r"(?:(?:[a-z]{,100}){,100}){,100}", r"(?:(?:[a-z]{,100}){,100}){,100}"
)

def test_unused_greedy(self):
    # Each pair is (input pattern, expected simplified pattern).
    cases = (
        (r"^[\s\S]+?\.", r"^.+?\."),
        (r"^[^.]+?\.", r"^[^.]+\."),
    )
    for source, expected in cases:
        self._assert_equal(source, expected)
    # TODO: simplify that to r"^.+?\."