diff --git a/Cargo.lock b/Cargo.lock
index 3112e45..426d6d6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,4 +4,57 @@ version = 3
 [[package]]
 name = "gregex"
-version = "0.6.0"
+version = "0.7.0"
+dependencies = [
+ "gregex-logic",
+ "gregex-macros",
+]
+
+[[package]]
+name = "gregex-logic"
+version = "0.1.0"
+
+[[package]]
+name = "gregex-macros"
+version = "0.1.0"
+dependencies = [
+ "gregex-logic",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.86"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
diff --git a/Cargo.toml b/Cargo.toml
index 6c70e12..5eda481 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gregex"
-version = "0.6.0"
+version = "0.7.0"
 edition = "2021"
 authors = ["Saphereye "]
 license = "MIT"
@@ -19,4 +19,12 @@ repository = "https://github.com/Saphereye/gregex"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[workspace]
+members = [
+    "gregex-macros",
+    "gregex-logic",
+]
+
 [dependencies]
+gregex-macros = { path = "gregex-macros" }
+gregex-logic = { path = "gregex-logic" }
diff --git a/README.md b/README.md
index 9737079..b86c473 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,5 @@
 # Gregex ![crates.io](https://img.shields.io/crates/v/gregex.svg) ![Build Passing](https://github.com/Saphereye/gregex/actions/workflows/ci.yml/badge.svg)
-Gregex is a regular expression solver which utilizes Non-deterministic Finite Automata (NFA) to simulate the input strings.
+![](https://github.com/Saphereye/gregex/raw/master/assets/gregex_workflow.excalidraw.svg)
-## Usage
-
-```rust
-extern crate gregex;
-use gregex::*;
-fn main() {
-    let tree = dot!(star!('a'), 'b', 'c');
-    let regex = regex(&tree);
-    assert!(regex.run("abc"));
-    assert!(!regex.run("a"));
-    assert!(regex.run("aaabc"));
-}
-```
-
-## Theory
-The project uses [Glushkov's construction algorithm](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm) for creating the NFA.
-
-The pipeline can be summarised as below
-![](https://github.com/Saphereye/gregex/blob/master/assets/gregex_workflow.excalidraw.svg)
\ No newline at end of file
+Gregex is a regular expression solver which utilizes Non-deterministic Finite Automata (NFA) to simulate the input strings.
\ No newline at end of file
diff --git a/examples/dot.rs b/examples/dot.rs
new file mode 100644
index 0000000..2f7d9b4
--- /dev/null
+++ b/examples/dot.rs
@@ -0,0 +1,9 @@
+extern crate gregex;
+use gregex::*;
+
+fn main() {
+    let runner = regex!(dot!('a', 'b', 'c'));
+    assert_eq!(runner.run("abc"), true);
+    assert_eq!(runner.run("ab"), false);
+    assert_eq!(runner.run("abcd"), false);
+}
diff --git a/examples/or.rs b/examples/or.rs
new file mode 100644
index 0000000..4353a3f
--- /dev/null
+++ b/examples/or.rs
@@ -0,0 +1,9 @@
+extern crate gregex;
+use gregex::*;
+
+fn main() {
+    let runner = regex!(or!('a', 'b', 'c'));
+    assert_eq!(runner.run("a"), true);
+    assert_eq!(runner.run("b"), true);
+    assert_eq!(runner.run("c"), true);
+}
diff --git a/examples/star.rs b/examples/star.rs
new file mode 100644
index 0000000..e97f2a4
--- /dev/null
+++ b/examples/star.rs
@@ -0,0 +1,9 @@
+extern crate gregex;
+use gregex::*;
+
+fn main() {
+    let runner = regex!(star!('a'));
+    assert_eq!(runner.run("a"), true);
+    assert_eq!(runner.run("aa"), true);
+    assert_eq!(runner.run(""), true);
+}
diff --git a/gregex-logic/Cargo.toml b/gregex-logic/Cargo.toml
new file mode 100644
index 0000000..661fcc0
--- /dev/null
+++ b/gregex-logic/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "gregex-logic"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
diff --git a/gregex-logic/README.md b/gregex-logic/README.md
new file mode 100644
index 0000000..2d29383
--- /dev/null
+++ b/gregex-logic/README.md
@@ -0,0 +1,6 @@
+# Gregex Logic
+Contains the underlying logic of the Gregex crate. This crate is responsible for converting the Node tree to the NFA, which is then used to match the input string.
+
+The crate uses [Glushkov's construction algorithm](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm) to convert the Node tree to the NFA. The advantage over Thompson's construction is that the resulting NFA has a number of states equal to the number of terminals plus one. The NFA produced by Thompson's construction can be reduced to the Glushkov form by removing its epsilon transitions.
+
+The `translation` module contains the code to convert the Node tree to the NFA. The `nfa` module contains the code to match the input string against the NFA.
\ No newline at end of file
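For reference, the pipeline this README describes is the same one the `regex!` macro later in this diff expands to. Below is a minimal sketch of driving `gregex-logic` directly, assuming the paths used in that expansion (`translation::node::{prefix_set, suffix_set, factors_set}`, `translation::operator::Operator`, and `nfa::NFA::set_to_nfa`) are public as written there:

```rust
use gregex_logic::nfa::NFA;
use gregex_logic::translation::node::{factors_set, prefix_set, suffix_set, Node};
use gregex_logic::translation::operator::Operator;

fn main() {
    // Node tree for the regex `a*`: a single terminal under a Production (star) node.
    let regex_tree = Node::Operation(
        Operator::Production,
        Box::new(Node::Terminal('a', 0u32)),
        None,
    );

    // Glushkov's construction: derive the prefix, suffix and factor sets,
    // then assemble the NFA from those sets.
    let prefix_set = prefix_set(&regex_tree);
    let suffix_set = suffix_set(&regex_tree);
    let factors_set = factors_set(&regex_tree);
    let nfa = NFA::set_to_nfa(&prefix_set, &suffix_set, &factors_set);

    // The NFA simulates the input string; `a*` accepts the empty string too.
    assert!(nfa.run("aaa"));
    assert!(nfa.run(""));
}
```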
diff --git a/gregex-logic/src/lib.rs b/gregex-logic/src/lib.rs
new file mode 100644
index 0000000..d41c5f2
--- /dev/null
+++ b/gregex-logic/src/lib.rs
@@ -0,0 +1,7 @@
+#[doc = include_str!("../README.md")]
+#[cfg(not(doctest))]
+pub mod nfa;
+pub mod translation;
+
+use std::sync::atomic::AtomicU32;
+pub static TERMINAL_COUNT: AtomicU32 = AtomicU32::new(0);
diff --git a/src/nfa.rs b/gregex-logic/src/nfa.rs
similarity index 100%
rename from src/nfa.rs
rename to gregex-logic/src/nfa.rs
diff --git a/src/translation/mod.rs b/gregex-logic/src/translation/mod.rs
similarity index 100%
rename from src/translation/mod.rs
rename to gregex-logic/src/translation/mod.rs
diff --git a/src/translation/node.rs b/gregex-logic/src/translation/node.rs
similarity index 100%
rename from src/translation/node.rs
rename to gregex-logic/src/translation/node.rs
diff --git a/src/translation/operator.rs b/gregex-logic/src/translation/operator.rs
similarity index 100%
rename from src/translation/operator.rs
rename to gregex-logic/src/translation/operator.rs
diff --git a/src/translation/setterminal.rs b/gregex-logic/src/translation/setterminal.rs
similarity index 100%
rename from src/translation/setterminal.rs
rename to gregex-logic/src/translation/setterminal.rs
diff --git a/gregex-macros/Cargo.toml b/gregex-macros/Cargo.toml
new file mode 100644
index 0000000..c0b3abe
--- /dev/null
+++ b/gregex-macros/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "gregex-macros"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+gregex-logic = { path = "../gregex-logic" }
+syn = { version = "1.0", features = ["full"] }
+quote = "1.0"
+proc-macro2 = "1.0"
+
+[lib]
+proc-macro = true
\ No newline at end of file
diff --git a/gregex-macros/README.md b/gregex-macros/README.md
new file mode 100644
index 0000000..3c37ea7
--- /dev/null
+++ b/gregex-macros/README.md
@@ -0,0 +1,19 @@
+# Gregex Macros
+Contains the macro interface for all the gregex functions.
+
+Without these, users would have to rely on functions that generate the Node tree at runtime. To explain this, let us first look at an example.
+
+Let's take the regex `a*`.
+
+The Node tree in this case would be:
+```rust
+Node::Operation(
+    Operator::Production,
+    Box::new(Node::Terminal('a', 0u32)),
+    None,
+)
+```
+
+Although we could wrap this in a function or a `macro_rules!` macro, the generated code is quite bloated. Instead, we can do the hard work during compilation, i.e. convert the regex to the final NFA.
+
+Currently, converting all the way to the NFA at compile time is not possible, but this crate can convert the regex to the intermediate Node tree form.
\ No newline at end of file
diff --git a/gregex-macros/src/lib.rs b/gregex-macros/src/lib.rs
new file mode 100644
index 0000000..59e8212
--- /dev/null
+++ b/gregex-macros/src/lib.rs
@@ -0,0 +1,173 @@
+#[doc = include_str!("../README.md")]
+#[cfg(not(doctest))]
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use quote::quote;
+use syn::{parse_macro_input, Expr, ExprLit, ExprMacro, Lit};
+
+#[proc_macro]
+pub fn dot(input: TokenStream) -> TokenStream {
+    let inputs = parse_macro_input!(input with syn::punctuated::Punctuated::<Expr, syn::Token![,]>::parse_terminated);
+
+    let nodes = inputs.iter().map(|expr| {
+        match expr {
+            Expr::Macro(ExprMacro { mac, .. }) => {
+                // Handle procedural macro
+                quote! { #mac }
+            }
+            Expr::Lit(ExprLit { lit, .. }) => match lit {
+                Lit::Char(c) => {
+                    let count = gregex_logic::TERMINAL_COUNT
+                        .fetch_add(1, core::sync::atomic::Ordering::SeqCst);
+                    quote! {
+                        gregex_logic::translation::node::Node::Terminal(#c, #count)
+                    }
+                }
+                _ => panic!("Unsupported literal type"),
+            },
+            _ => panic!("Unsupported input type"),
+        }
+    });
+
+    // Generate the code for concatenating nodes
+    let mut iter = nodes.into_iter();
+    let first = iter.next().expect("The input is empty");
+    let operations = iter.fold(first, |left, right| {
+        quote! {
+            gregex_logic::translation::node::Node::Operation(
+                gregex_logic::translation::operator::Operator::Concat,
+                Box::new(#left),
+                Some(Box::new(#right))
+            )
+        }
+    });
+
+    // Generate the final token stream
+    let gen = quote! {
+        #operations
+    };
+
+    gen.into()
+}
+
+#[proc_macro]
+pub fn or(input: TokenStream) -> TokenStream {
+    let inputs = parse_macro_input!(input with syn::punctuated::Punctuated::<Expr, syn::Token![,]>::parse_terminated);
+
+    let nodes = inputs.iter().map(|expr| {
+        match expr {
+            Expr::Macro(ExprMacro { mac, .. }) => {
+                // Handle procedural macro
+                quote! { #mac }
+            }
+            Expr::Lit(ExprLit { lit, .. }) => match lit {
+                Lit::Char(c) => {
+                    let count = gregex_logic::TERMINAL_COUNT
+                        .fetch_add(1, core::sync::atomic::Ordering::SeqCst);
+                    quote! {
+                        gregex_logic::translation::node::Node::Terminal(#c, #count)
+                    }
+                }
+                _ => panic!("Unsupported literal type"),
+            },
+            _ => panic!("Unsupported input type"),
+        }
+    });
+
+    // Generate the code for or-ing the nodes
+    let mut iter = nodes.into_iter();
+    let first = iter.next().expect("The input is empty");
+    let operations = iter.fold(first, |left, right| {
+        quote! {
+            gregex_logic::translation::node::Node::Operation(
+                gregex_logic::translation::operator::Operator::Or,
+                Box::new(#left),
+                Some(Box::new(#right))
+            )
+        }
+    });
+
+    // Generate the final token stream
+    let gen = quote! {
+        #operations
+    };
+
+    gen.into()
+}
+
+#[proc_macro]
+pub fn star(input: TokenStream) -> TokenStream {
+    let expr = parse_macro_input!(input as Expr);
+
+    let node = match expr {
+        Expr::Macro(ExprMacro { mac, .. }) => {
+            // Handle procedural macro
+            quote! { #mac }
+        }
+        Expr::Lit(ExprLit { lit, .. }) => match lit {
+            Lit::Char(c) => {
+                let count =
+                    gregex_logic::TERMINAL_COUNT.fetch_add(1, core::sync::atomic::Ordering::SeqCst);
+                quote! {
+                    gregex_logic::translation::node::Node::Terminal(#c, #count)
+                }
+            }
+            _ => panic!("Unsupported literal type"),
+        },
+        _ => panic!("Unsupported input type"),
+    };
+
+    // Generate the code for the star operation
+    let operation = quote! {
+        gregex_logic::translation::node::Node::Operation(
+            gregex_logic::translation::operator::Operator::Production,
+            Box::new(#node),
+            None
+        )
+    };
+
+    // Generate the final token stream
+    let gen = quote! {
+        #operation
+    };
+
+    gen.into()
+}
+
+#[proc_macro]
+pub fn regex(input: TokenStream) -> TokenStream {
+    let expr = parse_macro_input!(input as Expr);
+
+    // Convert the input expression into a Node structure
+    let node = match expr {
+        Expr::Macro(ExprMacro { mac, .. }) => {
+            // Handle procedural macro
+            quote! { #mac }
+        }
+        Expr::Lit(ExprLit { lit, .. }) => match lit {
+            Lit::Char(c) => {
+                let count =
+                    gregex_logic::TERMINAL_COUNT.fetch_add(1, core::sync::atomic::Ordering::SeqCst);
+                quote! {
+                    gregex_logic::translation::node::Node::Terminal(#c, #count)
+                }
+            }
+            _ => panic!("Unsupported literal type"),
+        },
+        _ => panic!("Unsupported input type"),
+    };
+
+    // Generate the code to convert the Node into a Regex
+    let gen = quote! {
+        {
+            let regex_tree = #node;
+            let prefix_set = gregex_logic::translation::node::prefix_set(&regex_tree);
+            let suffix_set = gregex_logic::translation::node::suffix_set(&regex_tree);
+            let factors_set = gregex_logic::translation::node::factors_set(&regex_tree);
+            gregex_logic::nfa::NFA::set_to_nfa(&prefix_set, &suffix_set, &factors_set)
+        }
+    };
+
+    gen.into()
+}
diff --git a/src/lib.rs b/src/lib.rs
index 1690175..465fda9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,118 +1,3 @@
-//! This crate provides a [regular expression](https://en.wikipedia.org/wiki/Regular_expression) engine that uses a [Nondeterministic finite automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) to simulate the regular expression.
-//! Here is a short example on how to use this crate
-//!
-//! ```rust
-//! extern crate gregex;
-//! use gregex::*;
-//!
-//! fn main() {
-//!     let tree = dot!(star!('a'), 'b', 'c');
-//!     let regex = regex(&tree);
-//!     assert!(regex.run("abc"));
-//!     assert!(!regex.run("a"));
-//!     assert!(regex.run("aaabc"));
-//! }
-//! ```
-//!
-//! The regex function uses the regular expression string to create a NFA that can be used to simulate the regular expression.
-//! The program uses the [Glushkov's construction algorithm](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm) to create its NFA.
-//! The NFA is then later simulated to check if the input string matches the regular expression.
-//!
-//! A brief overview of the pipeline:
-//! [![](https://github.com/Saphereye/gregex/blob/master/assets/gregex_workflow.excalidraw.svg)]
-//!
-
-pub mod nfa;
-pub mod translation;
-
-use nfa::*;
-use std::sync::atomic::AtomicU32;
-use translation::node::*;
-
-type Regex = NFA;
-
-/// Translates a regular expression tree to a NFA. This NFA can then be called to simulate inputs.
-pub fn regex(regex_tree: &Node) -> Regex {
-    let prefix_set = &prefix_set(regex_tree);
-    let suffix_set = &suffix_set(regex_tree);
-    let factors_set = &factors_set(regex_tree);
-    NFA::set_to_nfa(prefix_set, suffix_set, factors_set)
-}
-
-/// Keeps count of the terminals created. This is used to create unique terminals.
-pub static TERMINAL_COUNT: AtomicU32 = AtomicU32::new(0);
-
-/// Represents the `concatenation` action in regex. Can dot multiple nodes.
-///
-/// Regex: `ab`
-/// Gregex: `dot!('a', 'b')`
-#[macro_export]
-macro_rules! dot {
-    ($($node:expr),+ $(,)?) => {
-        {
-            let nodes = vec![$(helper!($node)),+];
-            nodes.into_iter().reduce(|left, right| {
-                $crate::translation::node::Node::Operation($crate::translation::operator::Operator::Concat, Box::new(left), Some(Box::new(right)))
-            }).expect("The input is empty")
-        }
-    };
-}
-
-/// Represents the `or` action in regex. Can 'or' multiple nodes.
-///
-/// Regex: `a|b`
-/// Gregex: `or!('a', 'b')`
-#[macro_export]
-macro_rules! or {
-    ($($node:expr),+ $(,)?) => {
-        {
-            let nodes = vec![$(helper!($node)),+];
-            nodes.into_iter().reduce(|left, right| {
-                $crate::translation::node::Node::Operation($crate::translation::operator::Operator::Or, Box::new(left), Some(Box::new(right)))
-            }).expect("The input is empty")
-        }
-    };
-}
-
-/// Helper function to handle literals and expressions inside the [or], [star] and [dot].
-#[macro_export]
-macro_rules! helper {
-    ($node:literal) => {{
-        {
-            let count = $crate::TERMINAL_COUNT.fetch_add(1, core::sync::atomic::Ordering::SeqCst);
-            $crate::translation::node::Node::Terminal($node, count)
-        }
-    }};
-    ($node:expr) => {
-        $node
-    };
-}
-
-/// Represents the `production` action in regex. This is a single node.
-///
-/// Regex: `a*`
-/// Gregex: `star!('a')`
-#[macro_export]
-macro_rules! star {
-    ($child:expr) => {
-        $crate::translation::node::Node::Operation(
-            $crate::translation::operator::Operator::Production,
-            Box::new(helper!($child)),
-            None,
-        )
-    };
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_regex() {
-        let tree = dot!(star!('a'), 'b', 'c');
-        let regex = regex(&tree);
-        assert!(regex.run("abc"));
-        assert!(!regex.run("a"));
-        assert!(regex.run("aaabc"));
-    }
-}
+#[doc = include_str!("../README.md")]
+#[cfg(not(doctest))]
+pub use gregex_macros::*;
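With src/lib.rs reduced to a re-export of the proc macros, the usage shown in the removed doctest carries over to the new API roughly as follows. This is a sketch based on the examples added in this diff, not part of the patch itself:

```rust
extern crate gregex;
use gregex::*;

fn main() {
    // `(a*)bc` built from the re-exported proc macros; `regex!` expands
    // directly to the Glushkov NFA at compile time.
    let runner = regex!(dot!(star!('a'), 'b', 'c'));
    assert!(runner.run("abc"));
    assert!(!runner.run("a"));
    assert!(runner.run("aaabc"));
}
```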