Merge pull request #47 from facelessuser/dir-selector

Add :dir() selector for HTML (WIP)
facelessuser · Dec 28, 2018 · 129dcfa · 129dcfa
2 parents 43e365b + 6a9718c
commit 129dcfa
Show file tree

Hide file tree

Showing 11 changed files with 294 additions and 18 deletions.
diff --git a/appveyor.yml b/appveyor.yml
@@ -24,6 +24,7 @@ init:
 
 install:
   # Setup Python test tools
+  - "%PYTHON%/Scripts/pip.exe install -U setuptools"
   - "%PYTHON%/Scripts/pip.exe install virtualenv"
   - "%PYTHON%/Scripts/pip.exe install tox"
   - "%PYTHON%/Scripts/pip.exe install codecov"

diff --git a/docs/src/dictionary/en-custom.txt b/docs/src/dictionary/en-custom.txt
@@ -25,6 +25,7 @@ accessor
 boolean
 builtin
 deprecations
+directionality
 html
 iterable
 iterables

diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md
@@ -1,8 +1,11 @@
 # Changelog
 
-## 1.4.1
+## 1.5.0
 
 - **NEW**: Add `select_one` method like Beautiful Soup has.
+- **NEW**: Add `:dir()` selector (HTML only).
+- **FIX**: Fix issues with handling HTML fragments (elements without a `BeautifulSoup` object as a parent).
+- **FIX**: Fix internal `nth` range check.
 
 ## 1.4.0
 

diff --git a/docs/src/markdown/selectors.md b/docs/src/markdown/selectors.md
@@ -81,7 +81,8 @@ Selector                        | Example                             | Descript
 `:checked`                      | `#!css input:checked`               | Selects every checked `#!html <input>` element.
 `:current`                      | `#!css p:current`                   | As the document is not rendered, this will never match.
 `:current(sel, sel)`            | `#!css :current(p, li, dt, dd)`     | As the document is not rendered, this will never match.
-`:default`                      | `#!css input:default`               | Selects all `#!html <inputs>` that are the default among their related elements. See CSS specification to learn more about all that this targets.
+`:default`                      | `#!css input:default`               | Selects all `#!html <input>` elements that are the default among their related elements. See the CSS specification to learn more about all that this targets.
+`:dir(direction)`               | `#!css div:dir(ltr)`                | Selects all `#!html <div>` elements that have a text direction of left to right.
 `:disabled`                     | `#!css input:disabled`              | Selects every disabled `#!html <input>` element.
 `:enabled`                      | `#!css input:enabled`               | Selects every enabled `#!html <input>` element.
 `:focus`                        | `#!css input:focus`                 | Focus states are not applicable, so this will never match.

diff --git a/soupsieve/__meta__.py b/soupsieve/__meta__.py
@@ -186,5 +186,5 @@ def parse_version(ver, pre=False):
     return Version(major, minor, micro, release, pre, post, dev)
 
 
-__version_info__ = Version(1, 4, 0, "final")
+__version_info__ = Version(1, 5, 0, "final")
 __version__ = __version_info__._get_canonical()
diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py
@@ -3,6 +3,7 @@
 from . import util
 import re
 from .import css_types as ct
+import unicodedata
 
 # Empty tag pattern (whitespace okay)
 RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@@ -21,6 +22,29 @@
 
 NS_XHTML = 'http://www.w3.org/1999/xhtml'
 
+# Mask covering both direction flags; `match_selectors` uses it to pull the `:dir()` bits out of a selector's flags.
+DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
+
+# Maps a lowercased `dir` attribute value to its direction flag.
+# `auto` maps to `0` (no flag), which callers distinguish from `None` (value missing or not recognized).
+DIR_MAP = {
+    'ltr': ct.SEL_DIR_LTR,
+    'rtl': ct.SEL_DIR_RTL,
+    'auto': 0
+}
+
+
+class FakeNthParent(object):
+    """
+    Fake parent for `nth` selector.
+
+    When we have a fragment with no `BeautifulSoup` document object,
+    we can't evaluate `nth` selectors properly.  Create a temporary
+    fake parent so we can traverse the root element as a child.
+
+    Only the `contents` attribute is provided: a single-item list
+    holding the wrapped element.
+    """
+
+    def __init__(self, element):
+        """Initialize with `element` as the only entry of `contents`."""
+
+        self.contents = [element]
+
 
 class CSSMatch(object):
     """Perform CSS matching."""
@@ -38,14 +62,17 @@ def __init__(self, selectors, scope, namespaces, flags):
         while doc.parent:
             doc = doc.parent
         root = None
-        for child in doc.children:
-            if util.is_tag(child):
-                root = child
-                break
+        if not util.is_doc(doc):
+            root = doc
+        else:
+            for child in doc.children:
+                if util.is_tag(child):
+                    root = child
+                    break
         self.root = root
         self.scope = scope if scope is not doc else root
         self.html_namespace = self.is_html_ns(root)
-        self.is_xml = doc.is_xml and not self.html_namespace
+        self.is_xml = doc._is_xml and not self.html_namespace
 
     def get_namespace(self, el):
         """Get the namespace for the element."""
@@ -128,6 +155,16 @@ def get_classes(self, el):
             classes = [c for c in classes.strip().split(' ') if c]
         return classes
 
+    def get_attribute_by_name(self, el, name, default=None):
+        """
+        Get attribute by name.
+
+        Returns the value of the first attribute of `el` whose key equals `name`,
+        or `default` when no key matches.  Keys are compared unchanged when
+        `self.is_xml` is set; otherwise they are passed through `util.lower` first.
+        `name` itself is never lowercased here, so callers presumably need to pass
+        it in lowercase for non-XML documents.
+        """
+
+        value = default
+        for k, v in el.attrs.items():
+            # Only the attribute key is normalized (and only for non-XML); `name` is used as given.
+            if (k if self.is_xml else util.lower(k)) == name:
+                value = v
+                break
+        return value
+
     def match_namespace(self, el, tag):
         """Match the namespace of the element."""
 
@@ -316,6 +353,8 @@ def match_nth(self, el, nth):
             if n.selectors and not self.match_selectors(el, n.selectors):
                 break
             parent = el.parent
+            if parent is None:
+                parent = FakeNthParent(el)
             last = n.last
             last_index = len(parent.contents) - 1
             relative_index = 0
@@ -370,9 +409,9 @@ def match_nth(self, el, nth):
                 idx = last_idx = a * count + b if var else a
 
             # Evaluate elements while our calculated nth index is still in range
-            while 1 <= idx <= last_index:
+            while 1 <= idx <= last_index + 1:
                 child = None
-                # Evaluate while our child index is still range.
+                # Evaluate while our child index is still in range.
                 while 0 <= index <= last_index:
                     child = parent.contents[index]
                     index += factor
@@ -557,7 +596,7 @@ def match_lang(self, el, langs):
         # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
         parent = el
         found_lang = None
-        while parent.parent and not found_lang:
+        while parent and parent.parent and not found_lang:
             ns = self.is_html_ns(parent)
             for k, v in parent.attrs.items():
                 if (
@@ -623,6 +662,100 @@ def match_lang(self, el, langs):
 
         return match
 
+    def get_bidi(self, el):
+        """
+        Get directionality from element text.
+
+        Walks the children of `el` in order, recursing into child elements, and
+        returns `ct.SEL_DIR_LTR` or `ct.SEL_DIR_RTL` for the first character whose
+        Unicode bidirectional class is `L`, `R`, or `AL`.  Returns `None` when no
+        such character is found.
+        """
+
+        for node in el.children:
+
+            # Analyze child elements
+            if util.is_tag(node):
+
+                # Avoid analyzing certain elements specified in the specification.
+                # A child carrying a recognized `dir` value (`ltr`, `rtl`, or `auto`) is skipped as well.
+                direction = DIR_MAP.get(util.lower(node.attrs.get('dir', '')), None)
+                if (
+                    util.lower(node.name) in ('bdi', 'script', 'style', 'textarea') or
+                    direction is not None
+                ):
+                    continue  # pragma: no cover
+
+                # Check directionality of this node's text
+                value = self.get_bidi(node)
+                if value is not None:
+                    return value
+
+                # Direction could not be determined
+                continue  # pragma: no cover
+
+            # Skip `doctype` comments, etc.
+            if util.is_special_string(node):
+                continue
+
+            # Analyze text nodes for directionality.
+            # `L` means left to right; `R` and `AL` both mean right to left.
+            for c in node:
+                bidi = unicodedata.bidirectional(c)
+                if bidi in ('AL', 'R', 'L'):
+                    return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
+        return None
+
+    def match_dir(self, el, directionality):
+        """
+        Check directionality.
+
+        `directionality` holds the direction flags requested by the selector
+        (`ct.SEL_DIR_LTR` and/or `ct.SEL_DIR_RTL`).  Returns whether the direction
+        resolved for `el` equals it.  When `el` has no usable direction of its own,
+        the check recurses on `el.parent`.
+        """
+
+        # If we have to match both left and right, we can't match either.
+        if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
+            return False
+
+        # Element has defined direction of left to right or right to left
+        # `direction` is `None` for a missing or unrecognized `dir`, and `0` for `dir="auto"`.
+        direction = DIR_MAP.get(util.lower(el.attrs.get('dir', '')), None)
+        if direction not in (None, 0):
+            return direction == directionality
+
+        # Element is the document element (the root) and no direction assigned, assume left to right.
+        is_root = self.match_root(el)
+        if is_root and direction is None:
+            return ct.SEL_DIR_LTR == directionality
+
+        # If `input[type=tel]` and no direction is assigned, assume left to right.
+        is_input = util.lower(el.name) == 'input'
+        is_textarea = util.lower(el.name) == 'textarea'
+        is_bdi = util.lower(el.name) == 'bdi'
+        itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
+        if is_input and itype == 'tel' and direction is None:
+            return ct.SEL_DIR_LTR == directionality
+
+        # Auto handling for text inputs
+        # A `textarea` is judged by its text content, an `input` by its `value` attribute.
+        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
+            if is_textarea:
+                value = []
+                for node in el.contents:
+                    if util.is_navigable_string(node) and not util.is_special_string(node):
+                        value.append(node)
+                value = ''.join(value)
+            else:
+                value = self.get_attribute_by_name(el, 'value', '')
+            if value:
+                # The first `L`, `R`, or `AL` character decides the direction.
+                for c in value:
+                    bidi = unicodedata.bidirectional(c)
+                    if bidi in ('AL', 'R', 'L'):
+                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
+                        return direction == directionality
+                # Non-empty value without any `L`, `R`, or `AL` character: assume left to right
+                return ct.SEL_DIR_LTR == directionality
+            elif is_root:
+                # Empty value on the root element: assume left to right
+                return ct.SEL_DIR_LTR == directionality
+            # Empty value elsewhere: use the parent's direction
+            return self.match_dir(el.parent, directionality)
+
+        # Auto handling for `bdi` (with no `dir` assigned) and any other element with `dir="auto"`.
+        if (is_bdi and direction is None) or direction == 0:
+            direction = self.get_bidi(el)
+            if direction is not None:
+                return direction == directionality
+            elif is_root:
+                return ct.SEL_DIR_LTR == directionality
+            return self.match_dir(el.parent, directionality)
+
+        # No direction of its own: match the parent's direction
+        return self.match_dir(el.parent, directionality)
+
     def match_selectors(self, el, selectors):
         """Check if element matches one of the selectors."""
 
@@ -674,6 +807,9 @@ def match_selectors(self, el, selectors):
                 # also not set.
                 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                     continue
+                # Validate element directionality
+                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
+                    continue
                 # Validate that the tag contains the specified text.
                 if not self.match_contains(el, selector.contains):
                     continue
@@ -691,7 +827,7 @@ def is_html_ns(self, el):
     def match(self, el):
         """Match."""
 
-        return el.parent and self.match_selectors(el, self.selectors)
+        return not util.is_doc(el) and util.is_tag(el) and self.match_selectors(el, self.selectors)
 
 
 class SoupSieve(ct.Immutable):

diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py
@@ -69,6 +69,7 @@
 
 # Complex pseudo classes that take very specific parameters and are handled special
 PSEUDO_SPECIAL = {
+    ':dir',
     ':lang',
     ':nth-child',
     ':nth-last-child',
@@ -127,6 +128,8 @@
 '''.format(ws=WSC, nth=NTH)
 # Pseudo class language (`:lang("*-de", en)`)
 PAT_PSEUDO_LANG = r':lang\({ws}*(?P<lang>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE)
+# Pseudo class direction (`:dir(ltr)`)
+PAT_PSEUDO_DIR = r':dir\({ws}*(?P<dir>ltr|rtl){ws}*\)'.format(ws=WSC)
 # Combining characters (`>`, `~`, ` `, `+`, `,`)
 PAT_COMBINE = r'{ws}*?(?P<relation>[,+>~]|[ \t\r\n\f](?![,+>~])){ws}*'.format(ws=WSC)
 # Extra: Contains (`:contains(text)`)
@@ -285,6 +288,7 @@ class CSSParser(object):
             ("pseudo_nth_child", SelectorPattern(PAT_PSEUDO_NTH_CHILD)),
             ("pseudo_nth_type", SelectorPattern(PAT_PSEUDO_NTH_TYPE)),
             ("pseudo_lang", SelectorPattern(PAT_PSEUDO_LANG)),
+            ("pseudo_dir", SelectorPattern(PAT_PSEUDO_DIR)),
             ("pseudo_class", SelectorPattern(PAT_PSEUDO_CLASS)),
             ("pseudo_element", SelectorPattern(PAT_PSEUDO_ELEMENT)),
             ("at_rule", SelectorPattern(PAT_AT_RULE)),
@@ -631,6 +635,14 @@ def parse_pseudo_lang(self, sel, m, has_selector):
 
         return has_selector
 
+    def parse_pseudo_dir(self, sel, m, has_selector):
+        """
+        Parse pseudo direction.
+
+        Reads the `dir` group of match `m` and ORs the matching flag into `sel.flags`:
+        `ct.SEL_DIR_LTR` when the lowercased value is `ltr`, otherwise `ct.SEL_DIR_RTL`.
+        Always returns `True`; the incoming `has_selector` value is not consulted.
+        """
+
+        value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
+        sel.flags |= value
+        has_selector = True
+        return has_selector
+
     def parse_selectors(self, iselector, index=0, flags=0):
         """Parse selectors."""
 
@@ -685,6 +697,10 @@ def parse_selectors(self, iselector, index=0, flags=0):
                     has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
                 elif key == 'pseudo_lang':
                     has_selector = self.parse_pseudo_lang(sel, m, has_selector)
+                elif key == 'pseudo_dir':
+                    has_selector = self.parse_pseudo_dir(sel, m, has_selector)
+                    # Currently only supports HTML
+                    is_html = True
                 elif key == 'pseudo_close':
                     if split_last:
                         raise SyntaxError("Expecting more selectors at postion {}".format(m.start(0)))

diff --git a/soupsieve/css_types.py b/soupsieve/css_types.py
@@ -11,6 +11,8 @@
 SEL_DEFAULT = 0x4
 SEL_INDETERMINATE = 0x8
 SEL_SCOPE = 0x10
+SEL_DIR_LTR = 0x20
+SEL_DIR_RTL = 0x40
 
 
 class Immutable(object):

diff --git a/soupsieve/util.py b/soupsieve/util.py
@@ -34,6 +34,13 @@
 UC_Z = ord('Z')
 
 
+def is_doc(obj):
+    """Return whether `obj` is an instance of `bs4.BeautifulSoup`."""
+
+    # `bs4` is imported when the function is called rather than at module import.
+    import bs4
+    return isinstance(obj, bs4.BeautifulSoup)
+
+
 def is_tag(obj):
     """Is tag."""
 

diff --git a/tests/test_level3.py b/tests/test_level3.py
@@ -27,6 +27,8 @@
 """
 from __future__ import unicode_literals
 from . import util
+import bs4
+import soupsieve as sv
 
 
 class TestLevel3(util.TestCase):
@@ -763,6 +765,12 @@ def test_nth_child(self):
             flags=util.HTML5
         )
 
+        # Paragraph is the root. There is no document.
+        markup = """<p id="1">text</p>"""
+        soup = bs4.BeautifulSoup(markup, 'html5lib')
+        fragment = soup.p.extract()
+        self.assertTrue(sv.match("p:nth-child(1)", fragment, flags=sv.DEBUG))
+
     def test_nth_last_child(self):
         """Test `nth` last child."""