Skip to content

Commit

Permalink
Merge pull request #47 from facelessuser/dir-selector
Browse files Browse the repository at this point in the history
Add :dir() selector for HTML (WIP)
  • Loading branch information
facelessuser authored Dec 28, 2018
2 parents 43e365b + 6a9718c commit 129dcfa
Show file tree
Hide file tree
Showing 11 changed files with 294 additions and 18 deletions.
1 change: 1 addition & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ init:

install:
# Setup Python test tools
- "%PYTHON%/Scripts/pip.exe install -U setuptools"
- "%PYTHON%/Scripts/pip.exe install virtualenv"
- "%PYTHON%/Scripts/pip.exe install tox"
- "%PYTHON%/Scripts/pip.exe install codecov"
Expand Down
1 change: 1 addition & 0 deletions docs/src/dictionary/en-custom.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ accessor
boolean
builtin
deprecations
directionality
html
iterable
iterables
Expand Down
5 changes: 4 additions & 1 deletion docs/src/markdown/about/changelog.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Changelog

## 1.4.1
## 1.5.0

- **NEW**: Add `select_one` method like Beautiful Soup has.
- **NEW**: Add `:dir()` selector (HTML only).
- **FIX**: Fix issues with handling HTML fragments (elements without a `BeautifulSoup` object as a parent).
- **FIX**: Fix internal `nth` range check.

## 1.4.0

Expand Down
3 changes: 2 additions & 1 deletion docs/src/markdown/selectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ Selector | Example | Descript
`:checked` | `#!css input:checked` | Selects every checked `#!html <input>` element.
`:current` | `#!css p:current` | As the document is not rendered, this will never match.
`:current(sel, sel)` | `#!css :current(p, li, dt, dd)` | As the document is not rendered, this will never match.
`:default` | `#!css input:default` | Selects all `#!html <inputs>` that are the default among their related elements. See CSS specification to learn more about all that this targets.
`:default` | `#!css input:default` | Selects all `#!html <input>` elements that are the default among their related elements. See the CSS specification to learn more about all that this targets.
`:dir(direction)` | `#!css div:dir(ltr)` | Selects all `#!html <div>` elements that have a text direction of left to right.
`:disabled` | `#!css input:disabled` | Selects every disabled `#!html <input>` element.
`:enabled` | `#!css input:enabled` | Selects every enabled `#!html <input>` element.
`:focus` | `#!css input:focus` | Focus states are not applicable, so this will never match.
Expand Down
2 changes: 1 addition & 1 deletion soupsieve/__meta__.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,5 +186,5 @@ def parse_version(ver, pre=False):
return Version(major, minor, micro, release, pre, post, dev)


__version_info__ = Version(1, 4, 0, "final")
__version_info__ = Version(1, 5, 0, "final")
__version__ = __version_info__._get_canonical()
154 changes: 145 additions & 9 deletions soupsieve/css_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from . import util
import re
from .import css_types as ct
import unicodedata

# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
Expand All @@ -21,6 +22,29 @@

NS_XHTML = 'http://www.w3.org/1999/xhtml'

# Mask covering both direction flags; used to isolate the `:dir()` request
# from the rest of a selector's flags.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL

# Maps a lowercased `dir` attribute value to its direction flag.
# `auto` maps to 0 (no flag set): the direction is not declared and has to be
# derived from the element's content instead.
DIR_MAP = {
'ltr': ct.SEL_DIR_LTR,
'rtl': ct.SEL_DIR_RTL,
'auto': 0
}


class FakeNthParent(object):
    """
    Stand-in parent for evaluating `nth` selectors on a parentless element.

    A fragment that has no `BeautifulSoup` document object above it has no
    parent whose children can be counted. This wrapper exposes the element
    as the single entry of `contents`, so the root element can be traversed
    as if it were a child.
    """

    def __init__(self, element):
        """Wrap `element` as the only child of this fake parent."""

        self.contents = [element]


class CSSMatch(object):
"""Perform CSS matching."""
Expand All @@ -38,14 +62,17 @@ def __init__(self, selectors, scope, namespaces, flags):
while doc.parent:
doc = doc.parent
root = None
for child in doc.children:
if util.is_tag(child):
root = child
break
if not util.is_doc(doc):
root = doc
else:
for child in doc.children:
if util.is_tag(child):
root = child
break
self.root = root
self.scope = scope if scope is not doc else root
self.html_namespace = self.is_html_ns(root)
self.is_xml = doc.is_xml and not self.html_namespace
self.is_xml = doc._is_xml and not self.html_namespace

def get_namespace(self, el):
"""Get the namespace for the element."""
Expand Down Expand Up @@ -128,6 +155,16 @@ def get_classes(self, el):
classes = [c for c in classes.strip().split(' ') if c]
return classes

def get_attribute_by_name(self, el, name, default=None):
    """
    Return the value of the first attribute of `el` whose name equals `name`.

    Attribute names are compared as-is for XML documents and lowercased
    otherwise. `default` is returned when no attribute matches.
    """

    matches = (
        attr_value
        for attr_name, attr_value in el.attrs.items()
        if (attr_name if self.is_xml else util.lower(attr_name)) == name
    )
    return next(matches, default)

def match_namespace(self, el, tag):
"""Match the namespace of the element."""

Expand Down Expand Up @@ -316,6 +353,8 @@ def match_nth(self, el, nth):
if n.selectors and not self.match_selectors(el, n.selectors):
break
parent = el.parent
if parent is None:
parent = FakeNthParent(el)
last = n.last
last_index = len(parent.contents) - 1
relative_index = 0
Expand Down Expand Up @@ -370,9 +409,9 @@ def match_nth(self, el, nth):
idx = last_idx = a * count + b if var else a

# Evaluate elements while our calculated nth index is still in range
while 1 <= idx <= last_index:
while 1 <= idx <= last_index + 1:
child = None
# Evaluate while our child index is still range.
# Evaluate while our child index is still in range.
while 0 <= index <= last_index:
child = parent.contents[index]
index += factor
Expand Down Expand Up @@ -557,7 +596,7 @@ def match_lang(self, el, langs):
# Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
parent = el
found_lang = None
while parent.parent and not found_lang:
while parent and parent.parent and not found_lang:
ns = self.is_html_ns(parent)
for k, v in parent.attrs.items():
if (
Expand Down Expand Up @@ -623,6 +662,100 @@ def match_lang(self, el, langs):

return match

def get_bidi(self, el):
    """
    Derive a direction flag from the text found under `el`.

    Children are visited in document order. The first character whose
    Unicode bidirectional class is `L`, `R` or `AL` decides the result:
    `ct.SEL_DIR_LTR` for `L`, `ct.SEL_DIR_RTL` otherwise. Returns `None`
    when no such character is found.
    """

    excluded_tags = ('bdi', 'script', 'style', 'textarea')
    strong_classes = ('AL', 'R', 'L')

    for child in el.children:

        if not util.is_tag(child):
            # Text-like node: ignore `doctype`, comments, etc.
            if util.is_special_string(child):
                continue

            # The first strongly typed character settles the direction.
            for char in child:
                bidi_class = unicodedata.bidirectional(char)
                if bidi_class in strong_classes:
                    return ct.SEL_DIR_LTR if bidi_class == 'L' else ct.SEL_DIR_RTL
            continue

        # Child element: tags excluded by the specification, and tags that
        # carry a recognized `dir` value of their own, are not inspected.
        own_direction = DIR_MAP.get(util.lower(child.attrs.get('dir', '')), None)
        if util.lower(child.name) in excluded_tags or own_direction is not None:
            continue  # pragma: no cover

        # Otherwise look for directional text inside the child element.
        found = self.get_bidi(child)
        if found is not None:
            return found

    return None

def match_dir(self, el, directionality):
    """
    Check whether the directionality of `el` matches `directionality`.

    `directionality` holds the `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL` flag(s)
    requested by the selector. The element's direction is resolved in this
    order:

    1. An explicit `dir` attribute of `ltr` or `rtl`.
    2. The root element with no `dir` attribute: left to right.
    3. `input[type=tel]` with no `dir` attribute: left to right.
    4. `dir="auto"` on text-like inputs and `textarea`: taken from the
       first strongly directional character of the value / text.
    5. `bdi` with no `dir`, or any other `dir="auto"` element: taken from
       descendant text via `get_bidi`.
    6. Otherwise the parent's direction.

    Returns `True` when the resolved direction equals the requested one,
    `False` otherwise (including when `el` is `None`).
    """

    # The parent walk below recurses on `el.parent`. If it runs past the top
    # of the tree without meeting the root there is no element left to
    # inspect, so nothing can match (previously an `AttributeError`).
    if el is None:
        return False

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Element has a defined direction of left to right or right to left.
    direction = DIR_MAP.get(util.lower(el.attrs.get('dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.match_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    tag_name = util.lower(el.name)
    is_input = tag_name == 'input'
    is_textarea = tag_name == 'textarea'
    is_bdi = tag_name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs (`direction == 0` means `dir="auto"`).
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # The value of a `textarea` is its own text, minus comments and the like.
            value = ''.join(
                node for node in el.contents
                if util.is_navigable_string(node) and not util.is_special_string(node)
            )
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            # The first strongly directional character decides.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Non-empty value without directional characters: assume left to right.
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(el.parent, directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.get_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(el.parent, directionality)

    # Match parent's direction.
    return self.match_dir(el.parent, directionality)

def match_selectors(self, el, selectors):
"""Check if element matches one of the selectors."""

Expand Down Expand Up @@ -674,6 +807,9 @@ def match_selectors(self, el, selectors):
# also not set.
if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
continue
# Validate element directionality
if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
continue
# Validate that the tag contains the specified text.
if not self.match_contains(el, selector.contains):
continue
Expand All @@ -691,7 +827,7 @@ def is_html_ns(self, el):
def match(self, el):
"""Match."""

return el.parent and self.match_selectors(el, self.selectors)
return not util.is_doc(el) and util.is_tag(el) and self.match_selectors(el, self.selectors)


class SoupSieve(ct.Immutable):
Expand Down
16 changes: 16 additions & 0 deletions soupsieve/css_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@

# Complex pseudo classes that take very specific parameters and are handled special
PSEUDO_SPECIAL = {
':dir',
':lang',
':nth-child',
':nth-last-child',
Expand Down Expand Up @@ -127,6 +128,8 @@
'''.format(ws=WSC, nth=NTH)
# Pseudo class language (`:lang("*-de", en)`)
PAT_PSEUDO_LANG = r':lang\({ws}*(?P<lang>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE)
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = r':dir\({ws}*(?P<dir>ltr|rtl){ws}*\)'.format(ws=WSC)
# Combining characters (`>`, `~`, ` `, `+`, `,`)
PAT_COMBINE = r'{ws}*?(?P<relation>[,+>~]|[ \t\r\n\f](?![,+>~])){ws}*'.format(ws=WSC)
# Extra: Contains (`:contains(text)`)
Expand Down Expand Up @@ -285,6 +288,7 @@ class CSSParser(object):
("pseudo_nth_child", SelectorPattern(PAT_PSEUDO_NTH_CHILD)),
("pseudo_nth_type", SelectorPattern(PAT_PSEUDO_NTH_TYPE)),
("pseudo_lang", SelectorPattern(PAT_PSEUDO_LANG)),
("pseudo_dir", SelectorPattern(PAT_PSEUDO_DIR)),
("pseudo_class", SelectorPattern(PAT_PSEUDO_CLASS)),
("pseudo_element", SelectorPattern(PAT_PSEUDO_ELEMENT)),
("at_rule", SelectorPattern(PAT_AT_RULE)),
Expand Down Expand Up @@ -631,6 +635,14 @@ def parse_pseudo_lang(self, sel, m, has_selector):

return has_selector

def parse_pseudo_dir(self, sel, m, has_selector):
    """
    Record the direction requested by a `:dir()` match on the selector.

    The captured `dir` group is lowercased; `ltr` sets `ct.SEL_DIR_LTR` on
    `sel.flags`, anything else sets `ct.SEL_DIR_RTL`. Always returns `True`
    because a selector has now been parsed.
    """

    if util.lower(m.group('dir')) == 'ltr':
        sel.flags |= ct.SEL_DIR_LTR
    else:
        sel.flags |= ct.SEL_DIR_RTL
    return True

def parse_selectors(self, iselector, index=0, flags=0):
"""Parse selectors."""

Expand Down Expand Up @@ -685,6 +697,10 @@ def parse_selectors(self, iselector, index=0, flags=0):
has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
elif key == 'pseudo_lang':
has_selector = self.parse_pseudo_lang(sel, m, has_selector)
elif key == 'pseudo_dir':
has_selector = self.parse_pseudo_dir(sel, m, has_selector)
# Currently only supports HTML
is_html = True
elif key == 'pseudo_close':
if split_last:
raise SyntaxError("Expecting more selectors at postion {}".format(m.start(0)))
Expand Down
2 changes: 2 additions & 0 deletions soupsieve/css_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
SEL_DEFAULT = 0x4
SEL_INDETERMINATE = 0x8
SEL_SCOPE = 0x10
SEL_DIR_LTR = 0x20
SEL_DIR_RTL = 0x40


class Immutable(object):
Expand Down
7 changes: 7 additions & 0 deletions soupsieve/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@
UC_Z = ord('Z')


def is_doc(obj):
    """Return whether `obj` is a `BeautifulSoup` document object."""

    # Imported inside the function, as in the original, rather than at module level.
    from bs4 import BeautifulSoup

    return isinstance(obj, BeautifulSoup)


def is_tag(obj):
"""Is tag."""

Expand Down
8 changes: 8 additions & 0 deletions tests/test_level3.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
"""
from __future__ import unicode_literals
from . import util
import bs4
import soupsieve as sv


class TestLevel3(util.TestCase):
Expand Down Expand Up @@ -763,6 +765,12 @@ def test_nth_child(self):
flags=util.HTML5
)

# Paragraph is the root. There is no document.
markup = """<p id="1">text</p>"""
soup = bs4.BeautifulSoup(markup, 'html5lib')
fragment = soup.p.extract()
self.assertTrue(sv.match("p:nth-child(1)", fragment, flags=sv.DEBUG))

def test_nth_last_child(self):
"""Test `nth` last child."""

Expand Down
Loading

0 comments on commit 129dcfa

Please sign in to comment.