From 8af97aa4cd487d718bf0e8266e6fc4cab7a96db1 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 13 Dec 2018 08:39:51 -0600 Subject: [PATCH 1/5] Whitespace fixes CSS selectors should be evaluated with CSS whitespace rules. CDATA, declarations, and processing instructions should not be considered in `:empty`. --- docs/src/dictionary/en-custom.txt | 1 + docs/src/markdown/about/changelog.md | 5 ++++ docs/src/markdown/selectors.md | 4 +-- soupsieve/css_match.py | 4 +-- soupsieve/css_parser.py | 45 ++++++++++++++-------------- soupsieve/util.py | 4 ++- tests/test_extra.py | 15 ++++++++++ 7 files changed, 50 insertions(+), 28 deletions(-) diff --git a/docs/src/dictionary/en-custom.txt b/docs/src/dictionary/en-custom.txt index e6b244f9..48ce11fe 100644 --- a/docs/src/dictionary/en-custom.txt +++ b/docs/src/dictionary/en-custom.txt @@ -2,6 +2,7 @@ API Accessors Aspell BeautifulSoup +CDATA CSS CSS's Changelog diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index 6e371780..45dfce37 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -1,5 +1,10 @@ # Changelog +## 1.0.0 + +- **FIX**: CSS selectors should be evaluated with CSS whitespace rules. +- **FIX**: Processing instructions, CDATA, and declarations should all be ignored from child considerations for `:empty`. + ## 1.0.0b1 - **NEW**: Add support for non-standard `:contains()` selector. diff --git a/docs/src/markdown/selectors.md b/docs/src/markdown/selectors.md index 9b98fdca..1661ab94 100644 --- a/docs/src/markdown/selectors.md +++ b/docs/src/markdown/selectors.md @@ -54,9 +54,7 @@ Selector | Example | Descript `:empty` | `#!css p:empty` | Selects every `#!html

` element that has no children and either no text. Whitespace and comments are ignored. !!! warning "Experimental Selectors" - `:has()` implementation is experimental and may change. There are currently no reference implementation available in any browsers, not to mention the CSS4 specifications have not been finalized, so current implementation is based on our best interpretation. - - Recent addition of `:nth-*`, `:first-*`, `:last-*`, and `:only-*` is experimental. It has been implemented to the best of our understanding, especially `of S` support. Any issues with should be reported. + `:has()` and `of S` support (in `:nth-child(an+b [of S]?)`) is experimental and may change. There are currently no reference implementations available in any browsers, not to mention the CSS4 specifications have not been finalized, so current implementation is based on our best interpretation. Any issues should be reported. ## Custom Selectors diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py index 11609a46..797981ee 100644 --- a/soupsieve/css_match.py +++ b/soupsieve/css_match.py @@ -5,7 +5,7 @@ from .util import deprecated # Empty tag pattern (whitespace okay) -RE_NOT_EMPTY = re.compile('[^ \t\r\n]') +RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') # Relationships REL_PARENT = ' ' @@ -395,7 +395,7 @@ def has_child(self, el): found_child = False for child in el.children: - if isinstance(child, util.CHILD): + if isinstance(child, util.TAG): found_child = True break return found_child diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index 8bf9032d..b4f823ac 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -8,17 +8,15 @@ # Sub-patterns parts -CSS_ESCAPES = r'(?:\\[a-f0-9]{1,6}[ ]?|\\.)' +WS = r'[ \t\r\n\f]' -NTH = r'(?:[-+])?(?:\d+n?|n)(?:(?<=n)\s*(?:[-+])\s*(?:\d+))?' +CSS_ESCAPES = r'(?:\\[a-f0-9]{{1,6}}{ws}?|\\.)'.format(ws=WS) -VALUE = r'''(?P"(?:\\.|[^\\"]+)*?"|'(?:\\.|[^\\']+)*?'|(?:[^'"\[\] \t\r\n]|{esc})+)'''.format(esc=CSS_ESCAPES) +NTH = r'(?:[-+])?(?:\d+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:\d+))?'.format(ws=WS) -ATTR = r''' -(?:\s*(?P[~^|*$]?=)\s* # compare -{value} -(?P[ ]+[is])?)?\s*\] # case sensitive -'''.format(value=VALUE) +VALUE = r'''(?P"(?:\\.|[^\\"]+)*?"|'(?:\\.|[^\\']+)*?'|(?:[^'"\[\] \f\t\r\n]|{esc})+)'''.format(esc=CSS_ESCAPES) + +ATTR = r'''(?:{ws}*(?P[~^|*$]?=){ws}*{value}(?P{ws}+[is])?)?{ws}*\]'''.format(ws=WS, value=VALUE) # Selector patterns PAT_ID = r'#(?:[-\w]|{esc})+'.format(esc=CSS_ESCAPES) @@ -30,41 +28,41 @@ PAT_XML_TAG = r'(?:(?:(?:[-\w.]|{esc})+|\*)?\|)?(?:(?:[-\w.]|{esc})+|\*)'.format(esc=CSS_ESCAPES) PAT_HTML_ATTR = r'''(?x) -\[\s*(?P(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:[-\w]|{esc})+) +\[{ws}*(?P(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:[-\w]|{esc})+) {attr} -'''.format(esc=CSS_ESCAPES, attr=ATTR) +'''.format(ws=WS, esc=CSS_ESCAPES, attr=ATTR) PAT_XML_ATTR = r'''(?x) -\[\s*(?P(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:[-\w.]|{esc})+) +\[{ws}*(?P(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:[-\w.]|{esc})+) {attr} -'''.format(esc=CSS_ESCAPES, attr=ATTR) +'''.format(ws=WS, esc=CSS_ESCAPES, attr=ATTR) PAT_PSEUDO_OPEN = r':(?:has|is|matches|not|where)\(' -PAT_PSEUDO_CLOSE = r'\)' +PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WS) PAT_PSEUDO = r':(?:empty|root|(?:first|last|only)-(?:child|of-type))\b' PAT_PSEUDO_NTH_CHILD = r'''(?x) (?P:nth-(?:last-)?child -\(\s*(?P{nth}|even|odd)\s*(?:\)|(?<=\s)of\s+)) -'''.format(nth=NTH) +\({ws}*(?P{nth}|even|odd){ws}*(?:\)|(?<={ws})of{ws}+)) +'''.format(ws=WS, nth=NTH) PAT_PSEUDO_NTH_TYPE = r'''(?x) (?P:nth-(?:last-)?of-type -\(\s*(?P{nth}|even|odd)\s*\)) -'''.format(nth=NTH) +\({ws}*(?P{nth}|even|odd){ws}*\)) +'''.format(ws=WS, nth=NTH) -PAT_SPLIT = r'\s*?(?P[,+>~]|[ ](?![,+>~]))\s*' +PAT_SPLIT = r'{ws}*?(?P[,+>~]|{ws}(?![,+>~])){ws}*'.format(ws=WS) # Extra selector patterns -PAT_CONTAINS = r':contains\(\s*{value}\s*\)'.format(value=VALUE) +PAT_CONTAINS = r':contains\({ws}*{value}{ws}*\)'.format(ws=WS, value=VALUE) # CSS escape pattern -RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{1,6}[ ]?)|(\\.))', re.I) +RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\.))'.format(ws=WS), re.I) # Pattern to break up `nth` specifiers -RE_NTH = re.compile(r'(?P[-+])?(?P\d+n?|n)(?:(?<=n)\s*(?P[-+])\s*(?P\d+))?', re.I) +RE_NTH = re.compile(r'(?P[-+])?(?P\d+n?|n)(?:(?<=n){ws}*(?P[-+]){ws}*(?P\d+))?'.format(ws=WS), re.I) SPLIT = ',' REL_HAS_CHILD = ": " @@ -437,7 +435,10 @@ def parse_split(self, sel, m, has_selector, selectors, relations, is_pseudo): relations.clear() else: sel.relations.extend(relations) - sel.rel_type = m.group('relation') + rel_type = m.group('relation').strip() + if not rel_type: + rel_type = ' ' + sel.rel_type = rel_type relations.clear() relations.append(sel) sel = _Selector() diff --git a/soupsieve/util.py b/soupsieve/util.py index f2a82847..50c2b254 100644 --- a/soupsieve/util.py +++ b/soupsieve/util.py @@ -14,8 +14,10 @@ DEFAULT_MODE = HTML5 TAG = bs4.Tag -CHILD = (TAG, bs4.Doctype, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction) COMMENT = bs4.Comment +DECLARATION = bs4.Declaration +CDATA = bs4.CData +PROC_INSTRUCT = bs4.ProcessingInstruction LC_A = ord('a') LC_Z = ord('z') diff --git a/tests/test_extra.py b/tests/test_extra.py index 8b215989..2b1fb81c 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -59,3 +59,18 @@ def test_contains(self): [], flags=sv.HTML5 ) + + def test_contains_escapes(self): + """Test tag.""" + + markup = """ +

Testing + thatcontains works.
+ """ + + self.assert_selector( + markup, + 'body span:contains("\nthat")', + ['2'], + flags=sv.HTML5 + ) From 87d6aa7879ff5c66af0d75b7f6cf8e3683d3cc55 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 13 Dec 2018 09:11:50 -0600 Subject: [PATCH 2/5] :contains should only parse CDATA in XML docs. Consider it ws in HTML. --- soupsieve/css_match.py | 7 ++++--- soupsieve/util.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py index 797981ee..ff87cf9c 100644 --- a/soupsieve/css_match.py +++ b/soupsieve/css_match.py @@ -414,12 +414,13 @@ def match_subselectors(self, el, selectors): match = False return match - def match_contains(self, el, contains): + def match_contains(self, el, contains, is_html): """Match element if it contains text.""" + types = (util.NAV_STRINGS,) if is_html else (util.NAV_STRINGS, util.CDATA) match = True for c in contains: - if c not in el.get_text(): + if c not in el.get_text(types=types): match = False break return match @@ -457,7 +458,7 @@ def match_selectors(self, el, selectors): # Verify relationship selectors if selector.relation and not self.match_relations(el, selector.relation): continue - if not self.match_contains(el, selector.contains): + if not self.match_contains(el, selector.contains, is_html): continue match = not is_not break diff --git a/soupsieve/util.py b/soupsieve/util.py index 50c2b254..85ce5f35 100644 --- a/soupsieve/util.py +++ b/soupsieve/util.py @@ -18,6 +18,7 @@ DECLARATION = bs4.Declaration CDATA = bs4.CData PROC_INSTRUCT = bs4.ProcessingInstruction +NAV_STRINGS = bs4.NavigableString LC_A = ord('a') LC_Z = ord('z') From e82ff4112fc2da9a965529211da1684641c259c6 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 13 Dec 2018 11:42:39 -0600 Subject: [PATCH 3/5] Never directly match the document itself. Must match an tag. --- docs/src/markdown/about/changelog.md | 1 + soupsieve/css_match.py | 2 +- tests/test_extra.py | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index 45dfce37..f728e5fd 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -4,6 +4,7 @@ - **FIX**: CSS selectors should be evaluated with CSS whitespace rules. - **FIX**: Processing instructions, CDATA, and declarations should all be ignored from child considerations for `:empty`. +- **FIX**: In Beautiful Soup, the document itself is the first tag. Do not match the "document" tag by returning false for any tag that doesn't have a parent. ## 1.0.0b1 diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py index ff87cf9c..9e4a694e 100644 --- a/soupsieve/css_match.py +++ b/soupsieve/css_match.py @@ -468,7 +468,7 @@ def match_selectors(self, el, selectors): def match(self, el): """Match.""" - return isinstance(el, util.TAG) and self.match_selectors(el, self.selectors) + return isinstance(el, util.TAG) and el.parent and self.match_selectors(el, self.selectors) class SoupSieve(util.Immutable): diff --git a/tests/test_extra.py b/tests/test_extra.py index 2b1fb81c..812a41f7 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -74,3 +74,24 @@ def test_contains_escapes(self): ['2'], flags=sv.HTML5 ) + + def test_contains_cdata(self): + """Test tag.""" + + markup = """ +
Testing that contains works.
+ """ + + self.assert_selector( + markup, + 'body *:contains("that")', + ['1'], + flags=sv.HTML5 + ) + + self.assert_selector( + markup, + '*:contains("that")', + ['1', '2'], + flags=sv.XML + ) From 0aa7b2360f1064eb79e6ac3583cfa0fe4a4e18f5 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 13 Dec 2018 23:19:04 -0600 Subject: [PATCH 4/5] No CDATA when using :empty or :contains(). Also document flags are gone --- docs/src/markdown/about/changelog.md | 5 +- docs/src/markdown/about/development.md | 3 +- docs/src/markdown/api.md | 49 ++++--------- docs/src/markdown/selectors.md | 4 ++ soupsieve/__init__.py | 2 +- soupsieve/css_match.py | 83 +++++++++++---------- soupsieve/css_parser.py | 71 +++++++----------- soupsieve/css_types.py | 7 +- soupsieve/util.py | 24 ++++++- tests/test_extra.py | 17 +++-- tests/test_level1.py | 21 +++--- tests/test_level2.py | 49 +++++++------ tests/test_level3.py | 99 +++++++++++++------------- tests/test_level4.py | 40 +++++------ tests/test_soupsieve.py | 53 ++++++++++---- tests/util.py | 13 ++-- 16 files changed, 278 insertions(+), 262 deletions(-) diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index f728e5fd..e70e32ad 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -1,9 +1,10 @@ # Changelog -## 1.0.0 +## 1.0.0b2 +- **NEW**: Drop document flags. Document type can be detected from the Beautiful Soup object directly. - **FIX**: CSS selectors should be evaluated with CSS whitespace rules. -- **FIX**: Processing instructions, CDATA, and declarations should all be ignored from child considerations for `:empty`. +- **FIX**: Processing instructions, CDATA, and declarations should all be ignored in `:contains` and child considerations for `:empty`. - **FIX**: In Beautiful Soup, the document itself is the first tag. Do not match the "document" tag by returning false for any tag that doesn't have a parent. ## 1.0.0b1 diff --git a/docs/src/markdown/about/development.md b/docs/src/markdown/about/development.md index 2c54d4eb..37abd2eb 100644 --- a/docs/src/markdown/about/development.md +++ b/docs/src/markdown/about/development.md @@ -220,7 +220,7 @@ class SelectorTag: class SelectorAttribute: """Selector attribute rule.""" - def __init__(self, attribute, prefix, pattern): + def __init__(self, attribute, prefix, pattern, xml_type_pattern): """Initialize.""" ``` @@ -229,6 +229,7 @@ class SelectorAttribute: `attribute` | Contains the attribute name to match. `prefix` | Contains the attribute namespace prefix to match if any. `pattern` | Contains a `re` regular expression object that matches the desired attribute value. +`xml_type_pattern` | As the default `type` pattern is case insensitive, when the attribute value is `type` and a case sensitivity has not been explicitly defined, a secondary case sensitive `type` pattern is compiled for use with XML documents when detected. ### `SelectorNth` diff --git a/docs/src/markdown/api.md b/docs/src/markdown/api.md index 53a56b57..e7c254ce 100644 --- a/docs/src/markdown/api.md +++ b/docs/src/markdown/api.md @@ -1,39 +1,12 @@ # API -## `soupsieve.HTML5` +Soup Sieve will detect the document type being used from the Beautiful Soup object that is given to it. For all HTML document types, it will treat tag names and attribute names without case sensitivity like most browsers do (even with XHTML). For HTML5, XHTML and XML, it will consider namespaces per the document's support (provided by the parser). To get namespaces support in HTML5, it is recommended to use `html5lib` as the parser. Some additional configuration is required when using namespaces, see [Namespace](#namespaces) for more information. -`HTML5` is a flag that instructs Soup Sieve to use HTML5 logic. When the `HTML5` flag is used, Soup Sieve will take into account namespaces for known embedded HTML5 namespaces such as SVG. `HTML5` will also not compare tag names and attribute names with case sensitivity. +While attribute values are always generally treated as case sensitive, HTML5, XHTML, and HTML treat the `type` attribute special. The `type` attribute's value is always case insensitive. This is generally how most browsers treat `type`. If you need `type` to be sensitive, you can use the `s` flag: `#!css [type="submit" s]`. -!!! tip - While attribute values are always treated as case sensitive, HTML5, XHTML, and HTML treat the `type` attribute special, `type`'s value is always case insensitive. This is generally how most browsers treat `type`. +## Flags - If you need `type` to be sensitive, you can use the `s` flag: `#!css [type="submit" s]`. - -Keep in mind, that Soup Sieve itself is not responsible for deciding what tag has or does not have a namespace. This is actually determined by the parser used in Beautiful Soup. This flag only tells Soup Sieve that the parser should be calculating namespaces, so it is okay to look at them. The user is responsible for using an appropriate parser for HTML5. If using the [lxml][lxml] or [html5lib][html5lib] with Beautiful Soup, HTML5 namespaces *should* be accounted for in the parsing. If you are using Python's builtin HTML parser, this may not be the case. - -## `soupsieve.HTML` - -`HTML` is a flag that instructs Soup Sieve to use pre HTML5 logic. When the `HTML` flag is used, Soup Sieve will not consider namespaces when evaluating elements. `HTML` will also not compare tag names and attribute names with case sensitivity. - -!!! tip - While attribute values are always treated as case sensitive, HTML5, XHTML, and HTML treat the `type` attribute special, `type`'s value is always case insensitive. This is generally how most browsers treat `type`. - - If you need `type` to be sensitive, you can use the `s` flag: `#!css [type="submit" s]`. - -## `soupsieve.XML` - -`XML` is a flag that instructs Soup Sieve to use XML logic. `XML` will cause Soup Sieve to take namespaces into considerations, and it will evaluate tag names and attribute names with case sensitivity. It will also relax what it considers valid tag name and attribute characters. It will also disable `.class` and `#id` selectors this is more an HTML concept. - -## `soupsieve.XHTML` - -`XHTML` is a flag that instructs Soup Sieve to use XHTML logic. This will cause Soup Sieve to take namespaces into considerations, and evaluate tag names and attributes names with no case sensitivity as this is how most browsers deal with XHTML tags. `.class` and `#id` are perfectly valid in XHTML. - -!!! tip - While attribute values are always treated as case sensitive, HTML5, XHTML, and HTML treat the `type` attribute special, `type`'s value is always case insensitive. This is generally how most browsers treat `type`. - - If you need `type` to be sensitive, you can use the `s` flag: `#!css [type="submit" s]`. - -It is recommend to use the `xml` mode in Beautiful Soup when parsing XHTML documents. +There are no flags at this time, but the parameter is provided for potential future use. ## `soupsieve.select()` @@ -44,7 +17,7 @@ def select(select, node, namespaces=None, limit=0, flags=0): `select` given a tag, will select all tags that match the provided CSS selector string. You can give `limit` a positive integer to return a specific number tags (0 means to return all tags). -`select` accepts a CSS selector string, a `node` or element, an optional [namespace](#namespaces) dictionary, a `limit`, and `flags`. If no flags are specified, HTML5 mode will be assumed. +`select` accepts a CSS selector string, a `node` or element, an optional [namespace](#namespaces) dictionary, a `limit`, and `flags`. ```pycon3 >>> import soupsieve as sv @@ -64,13 +37,13 @@ def iselect(select, node, namespaces=None, limit=0, flags=0): ## `soupsieve.match()` ```py3 -def match(select, node, namespaces=None, mode=0): +def match(select, node, namespaces=None, flags=0): """Match node.""" ``` `match` matches a given node/element with a given CSS selector. -`match` accepts a CSS selector string, a `node` or element, an optional [namespace](#namespaces) dictionary, and flags. If no flags are specified, HTML5 mode will be assumed. +`match` accepts a CSS selector string, a `node` or element, an optional [namespace](#namespaces) dictionary, and flags. ```pycon3 >>> nodes = sv.select('p:is(.a, .b, .c)', soup) @@ -89,7 +62,7 @@ def filter(select, nodes, namespaces=None, flags=0): `filter` takes an iterable containing HTML nodes and will filter them based on the provided CSS selector string. If given a Beautiful Soup tag, it will iterate the children that are tags. -`filter` accepts a CSS selector string, an iterable containing tags, an optional [namespace](#namespaces) dictionary, and flags. If no flags are specified, HTML5 mode will be assumed. +`filter` accepts a CSS selector string, an iterable containing tags, an optional [namespace](#namespaces) dictionary, and flags. ```pycon3 >>> sv.filter('p:not(.b)', soup.div) @@ -105,7 +78,7 @@ def comments(node, limit=0, flags=0): `comments` if useful to extract all comments from a document or document tag. It will extract from the given tag down through all of its children. You can limit how many comments are returned with `limit`. -`comments` accepts a `node` or element, a `limit`, and a flags. If no flags are specified, HTML5 mode will be assumed. +`comments` accepts a `node` or element, a `limit`, and flags. ## `soupsieve.icomments()` @@ -173,3 +146,7 @@ namespace = { ``` Tags do not necessarily have to have a prefix for Soup Sieve to recognize them. For instance, in HTML5, SVG *should* automatically get the SVG namespace. Depending how namespaces were defined in the documentation, tags may inherit namespaces in some conditions. Namespace assignment is mainly handled by the parser and exposed through the Beautiful Soup API. Soup Sieve uses the Beautiful Soup API to then compare namespaces when the appropriate document that supports namespaces is set. + +--8<-- +refs.txt +--8<-- diff --git a/docs/src/markdown/selectors.md b/docs/src/markdown/selectors.md index 1661ab94..39febb07 100644 --- a/docs/src/markdown/selectors.md +++ b/docs/src/markdown/selectors.md @@ -65,3 +65,7 @@ Just because we include selectors from one source, does not mean we have intenti Selector | Example | Description ------------------------------- | ----------------------------------- | ----------- `:contains(text)` | `#!css p:contains(text)` | Select all `#!html

` elements that contain "text" in their content, either directly in themselves or indirectly in their decedents. + +--8<-- +refs.txt +--8<-- diff --git a/soupsieve/__init__.py b/soupsieve/__init__.py index 05cd4ec8..5bfd26bb 100644 --- a/soupsieve/__init__.py +++ b/soupsieve/__init__.py @@ -40,7 +40,7 @@ SoupSieve = cm.SoupSieve -def compile(pattern, namespaces=None, flags=HTML5): # noqa: A001 +def compile(pattern, namespaces=None, flags=0): # noqa: A001 """Compile CSS pattern.""" if namespaces is None: diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py index 9e4a694e..7dfd0c29 100644 --- a/soupsieve/css_match.py +++ b/soupsieve/css_match.py @@ -19,6 +19,8 @@ REL_HAS_SIBLING = ':~' REL_HAS_CLOSE_SIBLING = ':+' +NS_XHTML = 'http://www.w3.org/1999/xhtml' + class CSSMatch: """Perform CSS matching.""" @@ -29,9 +31,6 @@ def __init__(self, selectors, namespaces, flags): self.selectors = selectors self.namespaces = namespaces self.flags = flags - self.mode = flags & util.MODE_MSK - if self.mode == 0: - self.mode == util.DEFAULT_MODE def get_namespace(self, el): """Get the namespace for the element.""" @@ -45,18 +44,12 @@ def get_namespace(self, el): def supports_namespaces(self): """Check if namespaces are supported in the HTML type.""" - return self.mode in (util.HTML5, util.XHTML, util.XML) - - def is_xml(self): - """Check if document is an XML type.""" - - return self.mode in (util.XHTML, util.XML) + return self.is_xml or self.html_namespace def get_attribute(self, el, attr, prefix): """Get attribute from element if it exists.""" value = None - is_xml = self.is_xml() if self.supports_namespaces(): value = None # If we have not defined namespaces, we can't very well find them, so don't bother trying. @@ -81,7 +74,7 @@ def get_attribute(self, el, attr, prefix): # We can't match our desired prefix attribute as the attribute doesn't have a prefix if prefix and not p and prefix != '*': continue - if is_xml: + if self.is_xml: # The prefix doesn't match if prefix and p and prefix != '*' and prefix != p: continue @@ -140,17 +133,15 @@ def match_attributes(self, el, attributes): if attributes: for a in attributes: value = self.get_attribute(el, a.attribute, a.prefix) + pattern = a.xml_type_pattern if not self.html_namespace and a.xml_type_pattern else a.pattern if isinstance(value, list): value = ' '.join(value) - if a.pattern is None and value is None: - match = False - break - elif a.pattern is not None and value is None: + if value is None: match = False break - elif a.pattern is None: + elif pattern is None: continue - elif value is None or a.pattern.match(value) is None: + elif pattern.match(value) is None: match = False break return match @@ -160,7 +151,7 @@ def match_tagname(self, el, tag): return not ( tag.name and - tag.name not in ((util.lower(el.name) if not self.is_xml() else el.name), '*') + tag.name not in ((util.lower(el.name) if not self.is_xml else el.name), '*') ) def match_tag(self, el, tag): @@ -284,7 +275,7 @@ def match_nth_tag_type(self, el, child): """Match tag type for `nth` matches.""" return( - (child.name == (util.lower(el.name) if not self.is_xml() else el.name)) and + (child.name == (util.lower(el.name) if not self.is_xml else el.name)) and (not self.supports_namespaces() or self.get_namespace(child) == self.get_namespace(el)) ) @@ -295,8 +286,6 @@ def match_nth(self, el, nth): for n in nth: matched = False - if not el.parent: - break if n.selectors and not self.match_selectors(el, n.selectors): break parent = el.parent @@ -390,20 +379,22 @@ def match_nth(self, el, nth): break return matched - def has_child(self, el): - """Check if element has child.""" - - found_child = False - for child in el.children: - if isinstance(child, util.TAG): - found_child = True - break - return found_child - def match_empty(self, el, empty): """Check if element is empty (if requested).""" - return not empty or (RE_NOT_EMPTY.search(el.text) is None and not self.has_child(el)) + is_empty = True + if empty: + for child in el.children: + if isinstance(child, util.TAG): + is_empty = False + break + elif ( + (isinstance(child, util.NAV_STRINGS) and not isinstance(child, util.NON_CONTENT_STRINGS)) and + RE_NOT_EMPTY.search(child) + ): + is_empty = False + break + return is_empty def match_subselectors(self, el, selectors): """Match selectors.""" @@ -414,10 +405,10 @@ def match_subselectors(self, el, selectors): match = False return match - def match_contains(self, el, contains, is_html): + def match_contains(self, el, contains): """Match element if it contains text.""" - types = (util.NAV_STRINGS,) if is_html else (util.NAV_STRINGS, util.CDATA) + types = (util.NAV_STRINGS,) if not self.is_xml else (util.NAV_STRINGS, util.CDATA) match = True for c in contains: if c not in el.get_text(types=types): @@ -429,7 +420,6 @@ def match_selectors(self, el, selectors): """Check if element matches one of the selectors.""" match = False - is_html = self.mode != util.XML is_not = selectors.is_not for selector in selectors: match = is_not @@ -442,10 +432,10 @@ def match_selectors(self, el, selectors): if not self.match_empty(el, selector.empty): continue # Verify id matches - if is_html and selector.ids and not self.match_id(el, selector.ids): + if selector.ids and not self.match_id(el, selector.ids): continue # Verify classes match - if is_html and selector.classes and not self.match_classes(el, selector.classes): + if selector.classes and not self.match_classes(el, selector.classes): continue # Verify attribute(s) match if not self.match_attributes(el, selector.attributes): @@ -458,16 +448,33 @@ def match_selectors(self, el, selectors): # Verify relationship selectors if selector.relation and not self.match_relations(el, selector.relation): continue - if not self.match_contains(el, selector.contains, is_html): + if not self.match_contains(el, selector.contains): continue match = not is_not break return match + def is_html_ns(self, el): + """Check if in HTML namespace.""" + + ns = getattr(el, 'namespace') if el else None + return ns and ns == NS_XHTML + def match(self, el): """Match.""" + doc = el + while doc.parent: + doc = doc.parent + root = None + for child in doc.children: + if isinstance(child, util.TAG): + root = child + break + self.html_namespace = self.is_html_ns(root) + self.is_xml = doc.is_xml and not self.html_namespace + return isinstance(el, util.TAG) and el.parent and self.match_selectors(el, self.selectors) diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index b4f823ac..ab49d51c 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -23,20 +23,13 @@ PAT_CLASS = r'\.(?:[-\w]|{esc})+'.format(esc=CSS_ESCAPES) -PAT_HTML_TAG = r'(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:(?:[-\w]|{esc})+|\*)'.format(esc=CSS_ESCAPES) +PAT_TAG = r'(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:(?:[-\w]|{esc})+|\*)'.format(esc=CSS_ESCAPES) -PAT_XML_TAG = r'(?:(?:(?:[-\w.]|{esc})+|\*)?\|)?(?:(?:[-\w.]|{esc})+|\*)'.format(esc=CSS_ESCAPES) - -PAT_HTML_ATTR = r'''(?x) +PAT_ATTR = r'''(?x) \[{ws}*(?P(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:[-\w]|{esc})+) {attr} '''.format(ws=WS, esc=CSS_ESCAPES, attr=ATTR) -PAT_XML_ATTR = r'''(?x) -\[{ws}*(?P(?:(?:(?:[-\w]|{esc})+|\*)?\|)?(?:[-\w.]|{esc})+) -{attr} -'''.format(ws=WS, esc=CSS_ESCAPES, attr=ATTR) - PAT_PSEUDO_OPEN = r':(?:has|is|matches|not|where)\(' PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WS) @@ -113,24 +106,6 @@ def enabled(self, flags): return True -class HtmlSelectorPattern(SelectorPattern): - """HTML selector pattern.""" - - def enabled(self, flags): - """Enabled.""" - - return (flags & util.MODE_MSK) in (util.HTML, util.HTML5, util.XHTML) - - -class XmlSelectorPattern(SelectorPattern): - """XML selector pattern.""" - - def enabled(self, flags): - """Enabled.""" - - return (flags & util.MODE_MSK) in (util.XML,) - - class _Selector: """ Intermediate selector class. @@ -206,12 +181,10 @@ class CSSParser: ("contains", SelectorPattern(PAT_CONTAINS)), ("pseudo_nth_child", SelectorPattern(PAT_PSEUDO_NTH_CHILD)), ("pseudo_nth_type", SelectorPattern(PAT_PSEUDO_NTH_TYPE)), - ("id", HtmlSelectorPattern(PAT_ID)), - ("class", HtmlSelectorPattern(PAT_CLASS)), - ("html_tag", HtmlSelectorPattern(PAT_HTML_TAG)), - ("xml_tag", XmlSelectorPattern(PAT_XML_TAG)), - ("html_attribute", HtmlSelectorPattern(PAT_HTML_ATTR)), - ("xml_attribute", XmlSelectorPattern(PAT_XML_ATTR)), + ("id", SelectorPattern(PAT_ID)), + ("class", SelectorPattern(PAT_CLASS)), + ("tag", SelectorPattern(PAT_TAG)), + ("attribute", SelectorPattern(PAT_ATTR)), ("pseudo_close", SelectorPattern(PAT_PSEUDO_CLOSE)), ("combine", SelectorPattern(PAT_SPLIT)) ] @@ -222,13 +195,11 @@ def __init__(self, selector, flags=0): self.pattern = selector self.flags = flags - mode = flags & util.MODE_MSK - - if mode in (util.HTML, util.HTML5, util.XML, util.XHTML, 0): - self.mode = mode if mode else util.DEFAULT_MODE - else: - raise ValueError("Invalid SelectorMatcher flag(s) '{}'".format(mode)) - self.adjusted_flags = flags | self.mode + dflags = self.flags & util.DEPRECATED_FLAGS + if dflags: + util.warn_deprecated( + "The following flags are deprecated and may be repurposed in the future '0x%02X'" % dflags + ) def parse_attribute_selector(self, sel, m, has_selector): """Create attribute selector from the returned regex match.""" @@ -236,6 +207,8 @@ def parse_attribute_selector(self, sel, m, has_selector): case = util.lower(m.group('case').strip()) if m.group('case') else None parts = [css_unescape(a.strip()) for a in m.group('ns_attr').split('|')] ns = '' + is_type = False + pattern2 = None if len(parts) > 1: ns = parts[0] attr = parts[1] @@ -243,10 +216,12 @@ def parse_attribute_selector(self, sel, m, has_selector): attr = parts[0] if case: flags = re.I if case == 'i' else 0 - elif self.mode == util.XML: - flags = 0 + elif util.lower(attr) == 'type': + flags = re.I + is_type = True else: - flags = re.I if util.lower(attr) == 'type' and not ns else 0 + flags = 0 + op = m.group('cmp') if op: value = css_unescape( @@ -275,8 +250,10 @@ def parse_attribute_selector(self, sel, m, has_selector): else: # Value matches pattern = re.compile(r'^%s$' % re.escape(value), flags) + if is_type: + pattern2 = re.compile(pattern.pattern) has_selector = True - sel.attributes.append(ct.SelectorAttribute(attr, ns, pattern)) + sel.attributes.append(ct.SelectorAttribute(attr, ns, pattern, pattern2)) return has_selector def parse_tag_pattern(self, sel, m, has_selector): @@ -513,9 +490,9 @@ def parse_selectors(self, iselector, is_pseudo=False, is_not=False, is_has=False has_selector, sel = self.parse_split(sel, m, has_selector, selectors, relations, is_pseudo) split_last = True continue - elif key in ('html_attribute', 'xml_attribute'): + elif key == 'attribute': has_selector = self.parse_attribute_selector(sel, m, has_selector) - elif key in ('html_tag', 'xml_tag'): + elif key == 'tag': if has_selector: raise SyntaxError("Tag must come first") has_selector = self.parse_tag_pattern(sel, m, has_selector) @@ -556,7 +533,7 @@ def selector_iter(self, pattern): while index <= end: m = None for k, v in self.css_tokens.items(): - if not v.enabled(self.adjusted_flags): + if not v.enabled(self.flags): # pragma: no cover continue m = v.pattern.match(pattern, index) if m: diff --git a/soupsieve/css_types.py b/soupsieve/css_types.py index 14514b4a..2dd7a5ec 100644 --- a/soupsieve/css_types.py +++ b/soupsieve/css_types.py @@ -67,15 +67,16 @@ def __init__(self, name, prefix): class SelectorAttribute(util.Immutable): """Selector attribute rule.""" - __slots__ = ("attribute", "prefix", "pattern", "_hash") + __slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash") - def __init__(self, attribute, prefix, pattern): + def __init__(self, attribute, prefix, pattern, xml_type_pattern): """Initialize.""" super().__init__( attribute=attribute, prefix=prefix, - pattern=pattern + pattern=pattern, + xml_type_pattern=xml_type_pattern ) diff --git a/soupsieve/util.py b/soupsieve/util.py index 85ce5f35..3346346d 100644 --- a/soupsieve/util.py +++ b/soupsieve/util.py @@ -9,9 +9,7 @@ HTML = 0x2 XHTML = 0x4 XML = 0x8 - -MODE_MSK = 0xF -DEFAULT_MODE = HTML5 +DEPRECATED_FLAGS = HTML5 | HTML | XHTML | XML TAG = bs4.Tag COMMENT = bs4.Comment @@ -19,6 +17,7 @@ CDATA = bs4.CData PROC_INSTRUCT = bs4.ProcessingInstruction NAV_STRINGS = bs4.NavigableString +NON_CONTENT_STRINGS = (COMMENT, DECLARATION, CDATA, PROC_INSTRUCT) LC_A = ord('a') LC_Z = ord('z') @@ -93,6 +92,15 @@ def __setattr__(self, name, value): raise AttributeError("'{}' is immutable".format(self.__class__.__name__)) + def __repr__(self): # pragma: no cover + """Representation.""" + + return "{}({})".format( + self.__base__(), ', '.join(["{}={!r}".format(k, getattr(self, k)) for k in self.__slots__[:-1]]) + ) + + __str__ = __repr__ + class ImmutableDict(Mapping): """Hashable, immutable dictionary.""" @@ -156,3 +164,13 @@ def _func(*args, **kwargs): return func(*args, **kwargs) return _func return _decorator + + +def warn_deprecated(message, stacklevel=2): + """Warn deprecated.""" + + warnings.warn( + message, + category=DeprecationWarning, + stacklevel=stacklevel + ) diff --git a/tests/test_extra.py b/tests/test_extra.py index 812a41f7..9bbf3aae 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -8,7 +8,6 @@ ``` """ from . import util -import soupsieve as sv class TestLevel1(util.TestCase): @@ -29,35 +28,35 @@ def test_contains(self): markup, 'body span:contains(that)', ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'body span:contains(" that ")', ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'body :contains(" that ")', ['1', '2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'body :contains( "Testing" )', ['1'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'body :contains(bad)', [], - flags=sv.HTML5 + flags=util.HTML5 ) def test_contains_escapes(self): @@ -72,7 +71,7 @@ def test_contains_escapes(self): markup, 'body span:contains("\nthat")', ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_contains_cdata(self): @@ -86,12 +85,12 @@ def test_contains_cdata(self): markup, 'body *:contains("that")', ['1'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, '*:contains("that")', ['1', '2'], - flags=sv.XML + flags=util.XML ) diff --git a/tests/test_level1.py b/tests/test_level1.py index 86fbbfea..7eb24dd2 100644 --- a/tests/test_level1.py +++ b/tests/test_level1.py @@ -17,7 +17,6 @@ - `:active`: No elements in our environment can be "active", so this makes no sense in our context. """ from . import util -import soupsieve as sv class TestLevel1(util.TestCase): @@ -35,7 +34,7 @@ def test_tag(self): """, "span", ["1"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_tags(self): @@ -51,7 +50,7 @@ def test_tags(self): """, "span, a", ["1", "2"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_child(self): @@ -67,7 +66,7 @@ def test_child(self): """, "div span", ["1"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_id(self): @@ -85,14 +84,14 @@ def test_id(self): markup, "#1", ["1"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "a#2", ["2"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_class(self): @@ -110,21 +109,21 @@ def test_class(self): markup, ".foo", ["1"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "a.bar", ["2"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, ".foo", ["1"], - flags=sv.XHTML + flags=util.XHTML ) def test_classes(self): @@ -144,7 +143,7 @@ def test_classes(self): markup, "a.foo.bar", ["4"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_escapes(self): @@ -162,5 +161,5 @@ def test_escapes(self): markup, ".foo\\:bar\\3a foobar", ["1"], - flags=sv.HTML5 + flags=util.HTML5 ) diff --git a/tests/test_level2.py b/tests/test_level2.py index 22e141cb..eedd9281 100644 --- a/tests/test_level2.py +++ b/tests/test_level2.py @@ -23,7 +23,6 @@ - `:focus`: Items cannot be focused in our environment, so this has little meaning and will not be implemented. """ from . import util -import soupsieve as sv class TestLevel2(util.TestCase): @@ -50,7 +49,7 @@ def test_direct_child(self): markup, "div > span", ["3"], - flags=sv.HTML5 + flags=util.HTML5 ) # No spaces @@ -58,7 +57,7 @@ def test_direct_child(self): markup, "div>span", ["3"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_direct_sibling(self): @@ -82,7 +81,7 @@ def test_direct_sibling(self): markup, "span + span", ["5", "6"], - flags=sv.HTML5 + flags=util.HTML5 ) # No spaces @@ -90,7 +89,7 @@ def test_direct_sibling(self): markup, "span+span", ["5", "6"], - flags=sv.HTML5 + flags=util.HTML5 ) # Complex @@ -98,7 +97,7 @@ def test_direct_sibling(self): markup, "span#4 + span#5", ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_wild_tag(self): @@ -119,7 +118,7 @@ def test_wild_tag(self): """, "body *", ["0", "1", "2", "3", "4", "5", "6", "div", "pre"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute(self): @@ -142,7 +141,7 @@ def test_attribute(self): markup, "[href]", ["2"], - flags=sv.HTML5 + flags=util.HTML5 ) # With spaces @@ -150,7 +149,7 @@ def test_attribute(self): markup, "[ href ]", ["2"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_multi_attribute(self): @@ -172,7 +171,7 @@ def test_multi_attribute(self): """, "span[id].test[data-test=test]", ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute_equal(self): @@ -196,7 +195,7 @@ def test_attribute_equal(self): markup, '[id=5]', ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) # Single quoted @@ -204,7 +203,7 @@ def test_attribute_equal(self): markup, "[id='5']", ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) # Double quoted @@ -212,7 +211,7 @@ def test_attribute_equal(self): markup, '[id="5"]', ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) # With spaces @@ -220,35 +219,35 @@ def test_attribute_equal(self): markup, '[ id = "5" ]', ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, '[ID="5"]', ["5"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, '[ id = "5" ]', ["5"], - flags=sv.HTML + flags=util.HTML ) self.assert_selector( markup, '[ID="5"]', ["5"], - flags=sv.HTML + flags=util.HTML ) self.assert_selector( '', '[ id = "5" ]', [], - flags=sv.HTML + flags=util.HTML ) def test_attribute_type(self): @@ -275,14 +274,14 @@ def test_attribute_type(self): markup, '[type="test"]', ["0", '2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, '[type="test"]', ['2'], - flags=sv.XML + flags=util.XML ) def test_attribute_start_dash(self): @@ -303,7 +302,7 @@ def test_attribute_start_dash(self): """, "[lang|=en]", ["0"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute_contains_space(self): @@ -327,7 +326,7 @@ def test_attribute_contains_space(self): markup, "[class~=test2]", ["0"], - flags=sv.HTML5 + flags=util.HTML5 ) # Start of list @@ -335,7 +334,7 @@ def test_attribute_contains_space(self): markup, "[class~=test-a]", ["pre"], - flags=sv.HTML5 + flags=util.HTML5 ) # End of list @@ -343,7 +342,7 @@ def test_attribute_contains_space(self): markup, "[class~=test-b]", ["pre"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_first_child(self): @@ -364,5 +363,5 @@ def test_first_child(self): """, "span:first-child", ["1", "4"], - flags=sv.HTML5 + flags=util.HTML5 ) diff --git a/tests/test_level3.py b/tests/test_level3.py index 76f4184e..9d720095 100644 --- a/tests/test_level3.py +++ b/tests/test_level3.py @@ -33,7 +33,6 @@ Is this even useful in the context of how Soup Sieve would be used? """ from . import util -import soupsieve as sv class TestLevel3(util.TestCase): @@ -57,7 +56,7 @@ def test_distant_sibling(self): """, "p ~ span", ["3"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_not(self): @@ -80,21 +79,21 @@ def test_not(self): markup, 'div :not([id="1"])', ["0", "2", "3", "4", "5", "6", "pre"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'div :NOT([id="1"])', ["0", "2", "3", "4", "5", "6", "pre"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'span:not([id="1"])', ["3", "4", "5", "6"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute_begins(self): @@ -115,7 +114,7 @@ def test_attribute_begins(self): """, "[class^=here]", ["0"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute_end(self): @@ -136,7 +135,7 @@ def test_attribute_end(self): """, "[class$=words]", ["0"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute_contains(self): @@ -159,7 +158,7 @@ def test_attribute_contains(self): markup, "[class*=words]", ["0", "3", "pre"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_root(self): @@ -189,14 +188,14 @@ def test_root(self): markup, ":root", ["root"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, ":root > body > div", ["div"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_empty(self): @@ -220,7 +219,7 @@ def test_empty(self): markup, "body :empty", ["4", "5", "6", "8"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_last_child(self): @@ -243,14 +242,14 @@ def test_last_child(self): markup, "span:last-child", ["1", "6"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "span:LAST-CHILD", ["1", "6"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_only_child(self): @@ -271,7 +270,7 @@ def test_only_child(self): """, "span:only-child", ["1"], - flags=sv.HTML5 + flags=util.HTML5 ) def test_namespace(self): @@ -313,7 +312,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) self.assert_selector( @@ -324,7 +323,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) self.assert_selector( @@ -335,7 +334,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) self.assert_selector( @@ -346,7 +345,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) # Because we employ level 4 selectors @@ -359,7 +358,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) # Now that we apply a default namespace. Null space. @@ -372,7 +371,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) self.assert_selector( @@ -384,7 +383,7 @@ def test_namespace(self): "foo": "http://me.com/namespaces/foofoo", "bar": "http://me.com/namespaces/foobar" }, - flags=sv.XML + flags=util.XML ) # Because no prefix is specified for "other" in the above document, @@ -400,7 +399,7 @@ def test_namespace(self): "bar": "http://me.com/namespaces/foobar", "other": "http://me.com/namespaces/other" }, - flags=sv.XML + flags=util.XML ) def test_attribute_namespace(self): @@ -441,7 +440,7 @@ def test_attribute_namespace(self): '[xlink|href*=forw],[xlink|href="images/sprites.svg#icon-redo"]', ['1', '2'], namespaces={"xlink": "http://www.w3.org/1999/xlink"}, - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( @@ -449,7 +448,7 @@ def test_attribute_namespace(self): '[bad|href*=forw]', [], namespaces={"xlink": "http://www.w3.org/1999/xlink"}, - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( @@ -457,7 +456,7 @@ def test_attribute_namespace(self): '[\\:href]', ['4'], namespaces={"xlink": "http://www.w3.org/1999/xlink"}, - flags=sv.HTML5 + flags=util.HTML5 ) def test_attribute_namespace_xhtml(self): @@ -499,7 +498,7 @@ def test_attribute_namespace_xhtml(self): '[xlink|href*=forw],[xlink|href="images/sprites.svg#icon-redo"]', ['1', '2'], namespaces={"xlink": "http://www.w3.org/1999/xlink"}, - flags=sv.XHTML + flags=util.XHTML ) def test_first_of_type(self): @@ -524,21 +523,21 @@ def test_first_of_type(self): markup, "p:first-of-type", ['0'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "span:first-of-type", ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "body :first-of-type", ['0', '2'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_last_of_type(self): @@ -563,21 +562,21 @@ def test_last_of_type(self): markup, "p:last-of-type", ['10'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "span:last-of-type", ['11'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "body :last-of-type", ['10', '11'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_only_of_type(self): @@ -602,7 +601,7 @@ def test_only_of_type(self): markup, "p:only-of-type", ['1', '4'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_nth_child(self): @@ -657,63 +656,63 @@ def test_nth_child(self): markup, "p:nth-child(-2)", [], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-child(2)", ['1'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-child(9n - 1)", ['7'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-child(2n + 1)", ['0', '8', '10'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-child(-n+3)", ['0', '1'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "span:nth-child(-n+3)", ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "body *:nth-child(-n+3)", ['0', '1', '2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-child(odd)", ['0', '8', '10'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-child(even)", ['1', '7', '9'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_nth_last_child(self): @@ -738,14 +737,14 @@ def test_nth_last_child(self): markup, "p:nth-last-child(2)", ['10'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-last-child(2n + 1)", ['1', '7', '9'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_nth_of_type(self): @@ -770,28 +769,28 @@ def test_nth_of_type(self): markup, "p:nth-of-type(3)", ['7'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-of-type(2n + 1)", ['0', '7', '9'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "span:nth-of-type(2n + 1)", ['2', '4', '6'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "body :nth-of-type(2n + 1)", ['0', '2', '4', '6', '7', '9'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_nth_last_of_type(self): @@ -816,12 +815,12 @@ def test_nth_last_of_type(self): markup, "p:nth-last-of-type(3)", ['8'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "p:nth-last-of-type(2n + 1)", ['1', '8', '10'], - flags=sv.HTML5 + flags=util.HTML5 ) diff --git a/tests/test_level4.py b/tests/test_level4.py index 1271c3bb..acc43d66 100644 --- a/tests/test_level4.py +++ b/tests/test_level4.py @@ -95,14 +95,14 @@ def test_attribute_case(self): markup, "[class*=WORDS]", [], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, "[class*=WORDS i]", ["0", "3", "pre"], - flags=sv.HTML5 + flags=util.HTML5 ) with self.assertRaises(SyntaxError): @@ -132,14 +132,14 @@ def test_attribute_type_case(self): markup, '[type="test" s]', ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, '[type="test" i]', ['0', '2'], - flags=sv.XML + flags=util.XML ) def test_is_matches_where(self): @@ -157,21 +157,21 @@ def test_is_matches_where(self): markup, ":is(span, a)", ["1", "2"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, ":is(span, a:matches(#2))", ["1", "2"], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, ":where(span, a:matches(#2))", ["1", "2"], - flags=sv.HTML5 + flags=util.HTML5 ) # Each pseudo class is evaluated separately @@ -180,7 +180,7 @@ def test_is_matches_where(self): markup, ":is(span):not(span)", [], - flags=sv.HTML5 + flags=util.HTML5 ) # Each pseudo class is evaluated separately @@ -189,7 +189,7 @@ def test_is_matches_where(self): markup, ":is(span):is(div)", [], - flags=sv.HTML5 + flags=util.HTML5 ) # Each pseudo class is evaluated separately @@ -198,7 +198,7 @@ def test_is_matches_where(self): markup, ":is(a):is(#2)", ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_multi_nested_not(self): @@ -221,7 +221,7 @@ def test_multi_nested_not(self): markup, 'div :not(p, :not([id=5]))', ['5'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_has(self): @@ -270,49 +270,49 @@ def test_has(self): markup, 'div:not(.aaaa):has(.kkkk > p.llll)', ['4', '5', '6'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'div:NOT(.aaaa):HAS(.kkkk > p.llll)', ['4', '5', '6'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'p:has(+ .dddd:has(+ div .jjjj))', ['2'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, 'p:has(~ .jjjj)', ['7', '8'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup2, 'div:has(> .bbbb, .ffff, .jjjj)', ['0', '4', '8'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup2, 'div:has(> :not(.bbbb, .ffff, .jjjj))', ['2', '6', '8'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup2, 'div:not(:has(> .bbbb, .ffff, .jjjj))', ['2', '6'], - flags=sv.HTML5 + flags=util.HTML5 ) def test_nth_child_of_s(self): @@ -337,12 +337,12 @@ def test_nth_child_of_s(self): markup, ":nth-child(2n + 1 of :is(p, span).test)", ['2', '6', '10'], - flags=sv.HTML5 + flags=util.HTML5 ) self.assert_selector( markup, ":nth-child(-n+3 of p)", ['0', '1', '7'], - flags=sv.HTML5 + flags=util.HTML5 ) diff --git a/tests/test_soupsieve.py b/tests/test_soupsieve.py index 75e03a5d..d5fca031 100644 --- a/tests/test_soupsieve.py +++ b/tests/test_soupsieve.py @@ -33,10 +33,10 @@ def test_comments(self): """ soup = bs4.BeautifulSoup(markup, 'html5lib') - comments = [str(c).strip() for c in sv.comments(soup, flags=sv.HTML5)] + comments = [str(c).strip() for c in sv.comments(soup)] self.assertEqual(sorted(comments), sorted(['before header', 'comment', "don't ignore"])) - comments = [str(c).strip() for c in sv.icomments(soup, limit=2, flags=sv.HTML5)] + comments = [str(c).strip() for c in sv.icomments(soup, limit=2)] self.assertEqual(sorted(comments), sorted(['before header', 'comment'])) def test_select(self): @@ -137,17 +137,17 @@ def test_copy_pickle(self): """Test copy and pickle.""" # Test that we can pickle and unpickle - p1 = sv.compile('p[id]', flags=sv.HTML5) + p1 = sv.compile('p[id]') sp1 = pickle.dumps(p1) pp1 = pickle.loads(sp1) self.assertTrue(pp1 == p1) # Test that we pull the same one from cache - p2 = sv.compile('p[id]', flags=sv.HTML5) + p2 = sv.compile('p[id]') self.assertTrue(p1 is p2) # Test that we compile a new one when providing a different flags - p3 = sv.compile('p[id]', flags=sv.HTML) + p3 = sv.compile('p[id]', flags=0x10) self.assertTrue(p1 is not p3) self.assertTrue(p1 != p3) @@ -183,7 +183,7 @@ def test_recompile(self): self.assertTrue(p1 is p2) with pytest.raises(ValueError): - sv.compile(p1, flags=sv.HTML) + sv.compile(p1, flags=0x10) with pytest.raises(ValueError): sv.compile(p1, namespaces={"": ""}) @@ -218,6 +218,41 @@ def test_immutable_dict(self): class TestDeprcations(unittest.TestCase): """Test Soup Sieve deprecations.""" + def test_flag_deprecations(self): + """Test flag deprecation.""" + + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + + sv.compile('p', flags=sv.HTML) + self.assertTrue(len(w) == 1) + self.assertTrue(issubclass(w[-1].category, DeprecationWarning)) + + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + + sv.compile('p', flags=sv.XHTML) + self.assertTrue(len(w) == 1) + self.assertTrue(issubclass(w[-1].category, DeprecationWarning)) + + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + + sv.compile('p', flags=sv.XML) + self.assertTrue(len(w) == 1) + self.assertTrue(issubclass(w[-1].category, DeprecationWarning)) + + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + + sv.compile('p', flags=sv.HTML5) + self.assertTrue(len(w) == 1) + self.assertTrue(issubclass(w[-1].category, DeprecationWarning)) + def test_selectiter_deprecation(self): """Test the deprecated iterator functions.""" @@ -306,12 +341,6 @@ def test_commentsiter_deprecation(self): class TestInvalid(unittest.TestCase): """Test invalid.""" - def test_invalid_mode(self): - """Test invalid mode.""" - - with self.assertRaises(ValueError): - sv.compile('p', None, sv.util.HTML | sv.util.HTML5) - def test_invalid_combination(self): """ Test invalid combination. diff --git a/tests/util.py b/tests/util.py index 14b60115..49937485 100644 --- a/tests/util.py +++ b/tests/util.py @@ -4,6 +4,11 @@ import textwrap import soupsieve as sv +HTML5 = 1 +HTML = 2 +XHTML = 4 +XML = 8 + class TestCase(unittest.TestCase): """Test case.""" @@ -11,12 +16,12 @@ class TestCase(unittest.TestCase): def assert_selector(self, markup, selectors, expected_ids, namespaces={}, flags=0): """Assert selector.""" - mode = flags & sv.util.MODE_MSK - if mode == sv.HTML: + mode = flags & 0x0F + if mode == HTML: bs_mode = 'lxml' - elif mode in (sv.HTML5, 0): + elif mode in (HTML5, 0): bs_mode = 'html5lib' - elif mode in (sv.XHTML, sv.XML): + elif mode in (XHTML, XML): bs_mode = 'xml' soup = bs4.BeautifulSoup(textwrap.dedent(markup.replace('\r\n', '\n')), bs_mode) From a72499567a0eee2b6d5594a98e04459b1dbdd5c6 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Thu, 13 Dec 2018 23:22:42 -0600 Subject: [PATCH 5/5] Bump version --- soupsieve/__meta__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soupsieve/__meta__.py b/soupsieve/__meta__.py index 030d5802..15f43345 100644 --- a/soupsieve/__meta__.py +++ b/soupsieve/__meta__.py @@ -186,5 +186,5 @@ def parse_version(ver, pre=False): return Version(major, minor, micro, release, pre, post, dev) -__version_info__ = Version(1, 0, 0, "beta", 1) +__version_info__ = Version(1, 0, 0, "beta", 2) __version__ = __version_info__._get_canonical()