Moved wiki to mkdocs (#33)

trag1c · May 28, 2024 · c738f16 · c738f16
1 parent 55358d0
commit c738f16
Show file tree

Hide file tree

Showing 7 changed files with 314 additions and 4 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2022–2023 trag1c
+Copyright (c) 2022–2024 trag1c
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -46,7 +46,7 @@ def hex2rgb(hex_color: str) -> tuple[int, int, int]:
 t = Crossandra(
     ignore_whitespace=True,
     rules=[
-        Rule(r"#[0-9a-fA-F]+", hex2rgb),
+        Rule(r"#[0-9a-fA-F]{6}", hex2rgb),
         common.WORD
     ]
 )
@@ -74,7 +74,7 @@ class Op(Enum):
 sm = Crossandra(
     Op,
     ignore_whitespace=True,
-    rules=[Rule(r"(?:\\|/)+", sm_int)]
+    rules=[Rule(r"[\\/]+", sm_int)]
 )
 
 print(*sm.tokenize(r"//\ ++ /\\/ --- /\/\/ - ///"))

diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -1,4 +1,6 @@
 mypy ~= 1.10.0
 pytest ~= 8.2.0
 pytest-cov ~= 5.0.0
-ruff ~= 0.4.2
+ruff ~= 0.4.2
+mkdocs ~= 1.6.0
+mkdocs-material ~= 9.5.25
diff --git a/docs/examples.md b/docs/examples.md
@@ -0,0 +1,70 @@
+## Brainfuck
+```py
+from enum import Enum
+from crossandra import Crossandra
+
+class Brainfuck(Enum):
+    ADD = "+"
+    SUB = "-"
+    LEFT = "<"
+    RIGHT = ">"
+    READ = ","
+    WRITE = "."
+    BEGIN_LOOP = "["
+    END_LOOP = "]"
+
+bf = Crossandra(Brainfuck, suppress_unknown=True)
+print(*bf.tokenize("cat program: ,[.,]"), sep="\n")
+# Brainfuck.READ
+# Brainfuck.BEGIN_LOOP
+# Brainfuck.WRITE
+# Brainfuck.READ
+# Brainfuck.END_LOOP
+```
+## Word tokenization with HEX2RGB conversion
+```py
+from crossandra import Crossandra, Rule, common
+
+def hex2rgb(hex_color: str) -> tuple[int, int, int]:
+    r, g, b = (int(hex_color[i:i+2], 16) for i in range(1, 6, 2))
+    return r, g, b
+
+t = Crossandra(
+    ignore_whitespace=True,
+    rules=[
+        Rule(r"#[0-9a-fA-F]{6}", hex2rgb),
+        common.WORD
+    ]
+)
+
+text = "My favorite color is #facade"
+print(t.tokenize(text))
+# ['My', 'favorite', 'color', 'is', (250, 202, 222)]
+```
+## Supporting [Samarium]'s numbers and arithmetic operators
+```py
+from enum import Enum
+from crossandra import Crossandra, Rule
+
+def sm_int(string: str) -> int:
+    return int(string.replace("/", "1").replace("\\", "0"), 2)
+
+class Op(Enum):
+    ADD = "+"
+    SUB = "-"
+    MUL = "++"
+    DIV = "--"
+    POW = "+++"
+    MOD = "---"
+
+sm = Crossandra(
+    Op,
+    ignore_whitespace=True,
+    rules=[Rule(r"[\\/]+", sm_int)]
+)
+
+print(*sm.tokenize(r"//\ ++ /\\/ --- /\/\/ - ///"))
+# 6 Op.MUL 9 Op.MOD 21 Op.SUB 7
+```
+
+[Samarium]: https://github.com/samarium-lang/samarium
diff --git a/docs/index.md b/docs/index.md
@@ -0,0 +1,41 @@
+# Crossandra
+Crossandra is a fast and simple tokenization library for Python operating on
+enums and regular expressions, with a decent amount of configuration.
+
+## Installation
+Crossandra is available on PyPI and can be installed with pip, or any other
+Python package manager:
+```console
+$ pip install crossandra
+```
+(Some systems may require you to use `pip3`, `python -m pip`, or `py -m pip`
+instead.)
+
+## Contributing
+
+Contributions are welcome!
+
+Please open an issue before submitting a pull request (unless it's a minor
+change like fixing a typo).
+
+To get started:
+
+1. Clone your fork of the project.
+2. Set up the project with `just install` (uses [uv]).
+3. After you're done, run `just check` to check your changes.
+
+!!! note
+    If you don't want to use [`just`][just], simply look up the recipes
+    in the project's [`justfile`][justfile].
+
+## License
+Crossandra is licensed under the MIT License.
+
+If you have any questions, or would like to get in touch, join my
+[Discord server]!
+
+[Documentation]: https://github.com/trag1c/crossandra/wiki/The-Crossandra-class
+[Discord server]: https://discord.gg/C8QE5tVQEq
+[just]: https://github.com/casey/just
+[justfile]: https://github.com/trag1c/crossandra/blob/main/justfile
+[uv]: https://github.com/astral-sh/uv
diff --git a/docs/reference.md b/docs/reference.md
@@ -0,0 +1,167 @@
+## `Crossandra`
+```py
+class Crossandra(
+    token_source: type[Enum] = Empty,
+    *,
+    convert_crlf: bool = True,
+    ignore_whitespace: bool = False,
+    ignored_characters: str = "",
+    rules: list[Rule[Any] | RuleGroup] | None = None,
+    suppress_unknown: bool = False,
+)
+```
+The core class representing a `Crossandra` tokenizer. Takes the following
+arguments:
+
+* `token_source`: an enum containing all possible tokens (defaults to an empty
+  enum)
+* `convert_crlf`: whether `\r\n` should be converted to `\n` before tokenization (defaults to `True`)
+* `ignored_characters`: a string of characters to ignore (defaults to `""`)
+* `ignore_whitespace`: whether spaces, tabs, newlines etc. should be ignored
+  (defaults to `False`)
+* `suppress_unknown`: whether unknown-token errors should be suppressed
+  (defaults to `False`)
+* `rules`: a list of additional rules to use (defaults to `None`)
+
+The enum takes priority over the rule list.  
+The rules are prioritized in the order they appear in the list (descending).
+
+Token enums can allow a tuple of values as aliases:
+```py
+class MarkdownStyle(Enum):
+    BOLD = "**"
+    ITALIC = ("_", "*")
+    UNDERLINE = "__"
+    STRIKETHROUGH = "~~"
+    CODE = ("`", "``")
+
+
+print(
+    *Crossandra(MarkdownStyle, ignore_whitespace=True).tokenize("* ** _ __"),
+    sep="\n"
+)
+# <MarkdownStyle.ITALIC: ('_', '*')>
+# <MarkdownStyle.BOLD: '**'>
+# <MarkdownStyle.ITALIC: ('_', '*')>
+# <MarkdownStyle.UNDERLINE: '__'>
+```
+
+### `Crossandra.tokenize`
+```py
+def tokenize(self, code: str) -> list[Enum | Any]
+```
+Tokenizes the input string. Returns a list of tokens.
+
+### `Crossandra.tokenize_lines`
+```py
+def tokenize_lines(self, code: str) -> list[list[Enum | Any]]
+```
+Tokenizes the input string line by line. Returns a nested list of tokens, where
+each inner list corresponds to a consecutive line of the input string.
+Equivalent to `[self.tokenize(line) for line in code.splitlines()]`.
+
+### Fast Mode
+When all tokens are of length 1 and there are no additional rules, Crossandra
+will use a simpler tokenization method (the so-called Fast Mode).
+
+!!! example
+    Tokenizing noisy Brainfuck code (the `Brainfuck` enum is taken from
+    [examples](examples.md#brainfuck))
+
+    *(tested on MacBook Air M1 (256/16) with pure Python wheels)*
+    ```py
+    # Setup
+    from random import choices
+    from string import punctuation
+
+    program = "".join(choices(punctuation, k=...))
+    tokenizer = Crossandra(Brainfuck, suppress_unknown=True)
+    ```
+
+    log10(k) | Default | Fast Mode | Speedup
+    ---      | ---:    | ---:      | ---:
+    1        | 40µs    | 20µs      | 100%
+    2        | 160µs   | 30µs      | 433%
+    3        | 1.5ms   | 130µs     | 1,054%
+    4        | 14ms    | 900µs     | 1,456%
+    5        | 290ms   | 9ms       | 3,122%
+
+
+## Rules and rule groups
+
+### `Rule`
+```py
+class Rule[T](
+    pattern: Pattern[str] | str,
+    converter: Callable[[str], T] | None = None,
+    *,
+    flags: RegexFlag | int = 0,
+    ignore: bool = False,
+)
+```
+Used for defining custom rules. `pattern` is a regex pattern to match (`flags`
+can be supplied). A `converter` can be supplied and will be called with the
+matched substring as the argument (defaults to `None`, returning the matched
+string directly). When `ignore` is `True`, the matched substring will be
+excluded from the output.
+
+`Rule` objects are hashable and comparable and can be ORed (`|`) for grouping
+with other `Rule`s and `RuleGroup`s.
+
+#### `Rule.apply`
+```py
+def apply(self, target: str) -> tuple[T | str | Ignored, int] | NotApplied
+```
+Checks if `target` matches the Rule's pattern. If it does, returns a tuple with
+
+* if `ignore=True`: the `Ignored` sentinel
+* if `converter=None`: the matched substring
+* otherwise: the result of calling the Rule's converter on the matched substring
+
+and the length of the matched substring. If it doesn't, returns the `NotApplied`
+sentinel.
+
+### `RuleGroup`
+```py
+class RuleGroup(rules: tuple[Rule[Any], ...])
+```
+Used for storing multiple Rules in one object. `RuleGroup`s can be constructed
+by passing in a tuple of rules or by ORing (`|`) two or more `Rule`s, and they
+can be ORed with other `RuleGroup`s or `Rule`s themselves. `RuleGroup`s are
+hashable and iterable.
+
+#### `RuleGroup.apply`
+```py
+def apply(self, target: str) -> tuple[Any | str | Ignored, int] | NotApplied
+```
+Applies the rules in the group to the target string. Returns the result of the
+first rule that matches, or `NotApplied` if none do.
+
+
+## Common patterns
+
+The `common` submodule is a collection of commonly used patterns.
+
+### Rules
+* CHAR (e.g. `'h'`)
+* LETTER (e.g. `m`)
+* WORD (e.g. `ball`)
+* SINGLE_QUOTED_STRING (e.g. `'nice fish'`)
+* DOUBLE_QUOTED_STRING (e.g. `"hello there"`)
+* C_NAME (e.g. `crossandra_rocks`)
+* NEWLINE (`\r\n` or `\n`)
+* DIGIT (e.g. `7`)
+* HEXDIGIT (e.g. `c`)
+* DECIMAL (e.g. `3.14`)
+* INT (e.g. `2137`)
+* SIGNED_INT (e.g. `-1`)
+* FLOAT (e.g. `1e3`)
+* SIGNED_FLOAT (e.g. `+4.3`)
+
+### Rule groups
+* STRING (`SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING`)
+* NUMBER (`INT | FLOAT`)
+* SIGNED_NUMBER (`SIGNED_INT | SIGNED_FLOAT`)
+* ANY_INT (`INT | SIGNED_INT`)
+* ANY_FLOAT (`FLOAT | SIGNED_FLOAT`)
+* ANY_NUMBER (`NUMBER | SIGNED_NUMBER`)
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -0,0 +1,30 @@
+site_name: Crossandra
+repo_url: https://github.com/trag1c/crossandra
+repo_name: trag1c/crossandra
+copyright: Copyright &copy; 2022–2024 trag1c
+nav:
+  - Home: index.md
+  - Examples: examples.md
+  - Reference: reference.md
+theme:
+  name: material
+  palette:
+    - media: "(prefers-color-scheme: light)"
+      scheme: default
+      primary: deep orange
+      accent: deep orange
+      toggle:
+        icon: material/toggle-switch-off-outline
+        name: Enable dark mode
+
+    - media: "(prefers-color-scheme: dark)"
+      scheme: slate
+      primary: deep orange
+      accent: deep orange
+      toggle:
+        icon: material/toggle-switch
+        name: Disable dark mode
+
+markdown_extensions:
+  - admonition
+  - pymdownx.superfences