Skip to content
This repository has been archived by the owner on Nov 26, 2022. It is now read-only.

Commit

Permalink
finishing lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
Lotes committed Apr 15, 2018
1 parent acca672 commit a7f0d83
Show file tree
Hide file tree
Showing 19 changed files with 240 additions and 45 deletions.
15 changes: 9 additions & 6 deletions Lexer/Automaton/AutomatonExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,21 @@ public static bool Read(this IAutomaton @this, string input)
var state = new HashSet<int>(@this.GetEpsilonClosure(@this.StartState));
foreach(var c in input)
{
state = new HashSet<int>(
@this.GetEpsilonClosure(
state.SelectMany(s => @this.TransitionsBySource
.GetOrDefault(s, EmptyTargets)
.ReadChar(c))
.ToArray()));
state = new HashSet<int>(@this.Step(state, c));
if (!state.Any())
return false;
}
return @this.AcceptingStates.Intersect(state).Any();
}

/// <summary>
/// Advances a set of automaton states by one input character.
/// </summary>
/// <param name="this">Automaton whose transitions are followed.</param>
/// <param name="state">The states the automaton is currently in.</param>
/// <param name="character">The input character to consume.</param>
/// <returns>
/// The epsilon closure of every target obtained by reading
/// <paramref name="character"/> from any state in <paramref name="state"/>.
/// </returns>
public static IEnumerable<int> Step(this IAutomaton @this, IEnumerable<int> state, char character)
{
    // Sources missing from TransitionsBySource fall back to EmptyTargets.
    var reached =
        (from source in state
         from target in @this.TransitionsBySource
             .GetOrDefault(source, EmptyTargets)
             .ReadChar(character)
         select target).ToArray();

    return @this.GetEpsilonClosure(reached);
}

public static void Print(this IAutomaton @this)
{
Console.WriteLine($"start: {@this.StartState}");
Expand Down
2 changes: 1 addition & 1 deletion Lexer/Automaton/CharSet.cs
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ IEnumerator IEnumerable.GetEnumerator()

public override string ToString()
{
return string.Join(",", list.Select(r => r.ToString()));
return string.Join(",", list == null ? new string[] { } : list.Where(r => r.Mode == SetMode.Included).Select(r => r.ToString()));
}

public override int GetHashCode()
Expand Down
9 changes: 9 additions & 0 deletions Lexer/CharType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace Lexer
{
/// <summary>
/// Classification of a single character; RegexParser stores one value per
/// character in its <c>asciiTable</c> dictionary.
/// </summary>
public enum CharType
{
// Assigned by RegexParser to control characters such as '\u0000' and '\u0001'.
Invalid,
// NOTE(review): presumably characters with a special meaning in a regular expression - confirm against RegexParser.
Special,
// NOTE(review): presumably characters that stand for themselves - confirm against RegexParser.
Literal
}
}
13 changes: 13 additions & 0 deletions Lexer/ILexer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Lexer
{
/// <summary>
/// Reads one token at a time from an input string.
/// </summary>
public interface ILexer
{
/// <summary>
/// Tries to read a single token from <paramref name="input"/> beginning at <paramref name="index"/>.
/// </summary>
/// <param name="input">The complete text being tokenized.</param>
/// <param name="index">Position in <paramref name="input"/> at which the token must start.</param>
/// <param name="token">Receives the token that was read; the Lexer implementation sets it to <c>null</c> when nothing matched.</param>
/// <returns><c>true</c> when a token was read, otherwise <c>false</c>.</returns>
bool Read(string input, int index, out IToken token);
}
}
4 changes: 2 additions & 2 deletions Lexer/RegularExpression/IParser.cs → Lexer/IRegexParser.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
using Lexer.Automaton;

namespace Lexer.RegularExpression
namespace Lexer
{
public interface IParser
public interface IRegexParser
{
IAutomaton Parse(string input);
}
Expand Down
9 changes: 9 additions & 0 deletions Lexer/IToken.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace Lexer
{
/// <summary>
/// A piece of input text recognized by a lexer.
/// </summary>
public interface IToken
{
/// <summary>Position in the input at which the token starts.</summary>
int Index { get; }
/// <summary>The text covered by the token; its length is used to advance to the next token.</summary>
string Value { get; }
/// <summary>The token type whose automaton recognized <see cref="Value"/>.</summary>
ITokenType TokenType { get; }
}
}
11 changes: 11 additions & 0 deletions Lexer/ITokenType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
using Lexer.Automaton;

namespace Lexer
{
/// <summary>
/// Describes one kind of token a lexer can produce.
/// </summary>
public interface ITokenType
{
/// <summary>Automaton the lexer runs over the input to recognize tokens of this type.</summary>
IAutomaton Automaton { get; }
/// <summary>Display name of the token type; printed by Token.ToString().</summary>
string Name { get; }
/// <summary>Ordering key: Lexer tries token types in ascending order of this value.</summary>
int Priority { get; }
}
}
43 changes: 43 additions & 0 deletions Lexer/Lexer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using Lexer.Automaton;
using System.Collections.Generic;
using System.Linq;

namespace Lexer
{
/// <summary>
/// Straightforward <see cref="ILexer"/>: tries every token type in ascending
/// priority order and returns the longest non-empty match of the first type
/// that matches at the requested position.
/// </summary>
public class Lexer : ILexer
{
    // Sorted once by ascending priority. OrderBy is a stable sort, so types
    // with equal priority keep the order in which they were supplied.
    private readonly ITokenType[] types;

    /// <summary>
    /// Creates a lexer for the given token types.
    /// </summary>
    /// <param name="types">Token types to recognize; lower <c>Priority</c> values are tried first.</param>
    public Lexer(IEnumerable<ITokenType> types)
    {
        this.types = types.OrderBy(tt => tt.Priority).ToArray();
    }

    /// <summary>
    /// Tries to read one token from <paramref name="input"/> starting at <paramref name="index"/>.
    /// </summary>
    /// <param name="input">The complete text being tokenized.</param>
    /// <param name="index">Position at which the token must start.</param>
    /// <param name="token">The token that was read, or <c>null</c> when no type matched.</param>
    /// <returns>
    /// <c>true</c> when some token type matched at least one character;
    /// otherwise <c>false</c>. Zero-length matches are never reported, because
    /// a caller advancing by the token length could not make progress.
    /// </returns>
    public bool Read(string input, int index, out IToken token)
    {
        foreach (var type in types)
        {
            var length = LongestMatch(type.Automaton, input, index);
            if (length > 0)
            {
                token = new Token(type, input.Substring(index, length), index);
                return true;
            }
        }

        token = null;
        return false;
    }

    // Length of the longest prefix of input (starting at index) accepted by
    // the automaton, or 0 when no non-empty prefix is accepted.
    private static int LongestMatch(IAutomaton automaton, string input, int index)
    {
        var accepting = automaton.AcceptingStates;

        // Begin in the epsilon closure of the start state, exactly as
        // AutomatonExtensions.Read does, so epsilon moves out of the start
        // state are honoured.
        var states = new HashSet<int>(automaton.GetEpsilonClosure(automaton.StartState));
        var matched = 0;

        for (var charIndex = index; states.Count > 0 && charIndex < input.Length; charIndex++)
        {
            states = new HashSet<int>(automaton.Step(states, input[charIndex]));

            // Remember the last position at which the automaton accepted, so a
            // later dead end falls back to the longest accepted prefix instead
            // of discarding the match.
            if (states.Any(s => accepting.Contains(s)))
            {
                matched = charIndex - index + 1;
            }
        }

        return matched;
    }
}
}
21 changes: 21 additions & 0 deletions Lexer/LexerExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using System;
using System.Collections.Generic;

namespace Lexer
{
/// <summary>
/// Convenience helpers for <see cref="ILexer"/>.
/// </summary>
public static class LexerExtensions
{
    /// <summary>
    /// Tokenizes the whole <paramref name="input"/> by repeatedly reading one
    /// token and advancing by the length of its value.
    /// </summary>
    /// <param name="lexer">Lexer used to read the individual tokens.</param>
    /// <param name="input">Text to tokenize.</param>
    /// <returns>The tokens in input order, produced lazily while enumerating.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown during enumeration when the lexer cannot read a token before the
    /// end of the input, or when it returns an empty token (which would never
    /// advance the read position).
    /// </exception>
    public static IEnumerable<IToken> Read(this ILexer lexer, string input)
    {
        var index = 0;
        IToken token;
        while (index < input.Length && lexer.Read(input, index, out token))
        {
            // An empty token leaves index unchanged; without this guard the
            // loop would yield the same empty token forever.
            if (token.Value.Length == 0)
            {
                throw new InvalidOperationException($"Lexer returned an empty token at index {index}; cannot advance.");
            }

            yield return token;
            index += token.Value.Length;
        }

        if (index < input.Length)
        {
            throw new InvalidOperationException("EOF not reached!");
        }
    }
}
}
18 changes: 18 additions & 0 deletions Lexer/Regex.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
using Lexer.Automaton;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Lexer
{
/// <summary>
/// Static entry point for turning a regular expression into an automaton.
/// </summary>
public static class Regex
{
    // Single parser instance reused by every call; readonly so it cannot be
    // swapped out after type initialization.
    // NOTE(review): RegexParser advances an instance-level cursor while
    // parsing, so this shared instance is presumably not safe for concurrent
    // Parse calls - confirm before using it from multiple threads.
    private static readonly IRegexParser parser = new RegexParser();

    /// <summary>
    /// Parses <paramref name="input"/> with the shared <see cref="IRegexParser"/>.
    /// </summary>
    /// <param name="input">The regular expression text.</param>
    /// <returns>The automaton produced by the parser for <paramref name="input"/>.</returns>
    public static IAutomaton Parse(string input) => parser.Parse(input);
}
}
17 changes: 5 additions & 12 deletions Lexer/RegularExpression/Impl/Parser.cs → Lexer/RegexParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,15 @@
using System.Xml.Schema;
using Lexer.Automaton;

namespace Lexer.RegularExpression.Impl
namespace Lexer
{
public enum CharType
public class RegexParser: IRegexParser
{
Invalid,
Special,
Literal
}

public class Parser: IParser
{
public static readonly CharSet HexChars = new CharSet(new CharRange('0', '9'), new CharRange('a', 'f'), new CharRange('A', 'Z'));
// Hexadecimal digits: 0-9, a-f and A-F. The upper-case range previously ran
// to 'Z', which accepted non-hex letters such as 'G' or 'Z'.
public static readonly ICharSet HexChars = new CharSet(new CharRange('0', '9'), new CharRange('a', 'f'), new CharRange('A', 'F'));
public static readonly Dictionary<string, ICharSet> escapesTo = new Dictionary<string, ICharSet>();
public static readonly Dictionary<char, CharType> asciiTable = new Dictionary<char, CharType>();

static Parser()
static RegexParser()
{
asciiTable['\u0000'] = CharType.Invalid;
asciiTable['\u0001'] = CharType.Invalid;
Expand Down Expand Up @@ -279,7 +272,7 @@ private bool MayConsume(char c)
return true;
}

private bool MayConsume(CharSet c)
private bool MayConsume(ICharSet c)
{
if (!c.Includes(Lookahead)) return false;
index++;
Expand Down
7 changes: 0 additions & 7 deletions Lexer/RegularExpression/IRegularExpression.cs

This file was deleted.

19 changes: 19 additions & 0 deletions Lexer/Token.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace Lexer
{
/// <summary>
/// Immutable <see cref="IToken"/>: a matched piece of text, its start
/// position and the token type it was matched as.
/// </summary>
public class Token : IToken
{
    /// <summary>Start position of the token in the input.</summary>
    public int Index { get; }

    /// <summary>The text covered by the token.</summary>
    public string Value { get; }

    /// <summary>The type this token was matched as.</summary>
    public ITokenType TokenType { get; }

    /// <summary>
    /// Creates a token from its type, text and start position.
    /// </summary>
    public Token(ITokenType type, string value, int index)
    {
        TokenType = type;
        Value = value;
        Index = index;
    }

    /// <summary>
    /// Formats the token as the quoted value followed by the type name,
    /// e.g. <c>"abc": ID</c>.
    /// </summary>
    public override string ToString() => $"\"{Value}\": {TokenType.Name}";
}
}
20 changes: 20 additions & 0 deletions Lexer/TokenType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using Lexer.Automaton;

namespace Lexer
{
/// <summary>
/// Immutable <see cref="ITokenType"/> holding a name, the automaton that
/// recognizes the token and the priority used to order token types.
/// </summary>
public class TokenType : ITokenType
{
    /// <summary>Display name of the token type.</summary>
    public string Name { get; }

    /// <summary>Automaton that recognizes tokens of this type.</summary>
    public IAutomaton Automaton { get; }

    /// <summary>Ordering key of this token type.</summary>
    public int Priority { get; }

    /// <summary>
    /// Creates a token type from its name, automaton and priority.
    /// </summary>
    public TokenType(string name, IAutomaton automaton, int priority)
    {
        Priority = priority;
        Automaton = automaton;
        Name = name;
    }
}
}
7 changes: 2 additions & 5 deletions Main/Program.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
using System;
using Lexer;
using Lexer.Automaton;
using Lexer.RegularExpression.Impl;

namespace Main
{
public class Program
{
public static void Main(string[] args)
{
new Parser().Parse(".");

var parser = new Parser();
var parser = new RegexParser();
do
{
Console.WriteLine("Please enter a regular expression!");
Expand All @@ -21,7 +19,6 @@ public static void Main(string[] args)
try
{
var automaton = parser.Parse(input);
Console.WriteLine(automaton.StateCount+" states");
automaton.Print();
}
catch (Exception e)
Expand Down
32 changes: 32 additions & 0 deletions Tests/UnitTests/LexerTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
using Lexer;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Tests.UnitTests
{
/// <summary>
/// End-to-end tests for the lexer built on top of the regex automata.
/// </summary>
[TestClass]
public class LexerTests
{
    /// <summary>
    /// Tokenizes a short text containing identifiers, whitespace and a number
    /// and verifies the value, type and start index of every token.
    /// </summary>
    [TestMethod]
    public void TestLexer()
    {
        var typeNumbers = new TokenType("NUMBER", Regex.Parse("\\d+"), 1);
        var typeId = new TokenType("ID", Regex.Parse("[a-zA-Z_][a-zA-Z0-9_]*"), 1);
        var typeWhitespace = new TokenType("SPACE", Regex.Parse("\\s"), 1);
        var lexer = new Lexer.Lexer(new[]
        {
            typeNumbers,
            typeId,
            typeWhitespace
        });

        // Materialize once so the lazy token sequence is not re-lexed per assertion.
        var tokens = lexer.Read("Hallo Du 3").ToList();

        // Kept for diagnostics in the test output.
        foreach (var token in tokens)
        {
            Console.WriteLine(token);
        }

        CollectionAssert.AreEqual(
            new[] { "Hallo", " ", "Du", " ", "3" },
            tokens.Select(t => t.Value).ToArray());
        CollectionAssert.AreEqual(
            new[] { "ID", "SPACE", "ID", "SPACE", "NUMBER" },
            tokens.Select(t => t.TokenType.Name).ToArray());
        CollectionAssert.AreEqual(
            new[] { 0, 5, 6, 8, 9 },
            tokens.Select(t => t.Index).ToArray());
    }
}
}
6 changes: 3 additions & 3 deletions Tests/UnitTests/RegexParserTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using Lexer.Automaton;
using Lexer.RegularExpression.Impl;
using Lexer;
using Lexer.Automaton;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System;
using System.Collections.Generic;
Expand All @@ -14,7 +14,7 @@ public class RegexParserTests
{
private IAutomaton ParseRegex(string input)
{
return new Parser().Parse(input);
return Regex.Parse(input);
}

[TestMethod]
Expand Down
32 changes: 23 additions & 9 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,32 @@ Basically it is a certain class of state machine. A deterministic finite automat

![Example](1-nfa-example.gv.png)

Circles represent state. Double cycles are accepting states. Transitions are labelled with input symbols. The input symbol `&epsilon;` is the `EMPTY` input (no character).
Circles represent states. Double circles are accepting states. Transitions are labelled with input symbols. The input symbol `&epsilon;` is the `EMPTY` input (no character).

### The `IAutomaton` Interface

```csharp
public interface IAutomaton
{
int StartState { get; }
int StateCount { get; }
ISet<int> AcceptingStates { get; }
//source -> input symbol -> targets
IReadOnlyDictionary<int, IReadOnlyDictionary<char, ISet<int>>> TransitionsBySource { get; }
int StartState { get; }
int StateCount { get; }
ISet<int> AcceptingStates { get; }
IReadOnlyDictionary<int, ITransitionTargets> TransitionsBySource { get; }
}

public interface ITransitionTargets: ILookup<ICharSet, int>
{
bool Contains(char c);
bool ContainsEpsilon();
}

public interface ICharSet : IEnumerable<CharRange>, IComparable<ICharSet>
{
int Length { get; }
bool Includes(char c);
bool Includes(char from, char to);
bool Excludes(char c);
bool Excludes(char from, char to);
}
```

Expand Down Expand Up @@ -100,10 +114,10 @@ Great the automata resulting from a regular expression are complete and minimal.

## Finally: The lexer!

The task now is to transform a text with a set of regular expressions into ordered set of tokens. So, the trivial approach would be a for loop over the set of regular expressions until one does match. But this is boring. I will combine all regular expressions with the alternate operator `|`. All I need is a feedback which regular expression has matched and how far (when multiple expressions matches, tell me all of them).
The task now is to transform a text, using a set of regular expressions, into an ordered sequence of tokens. The trivial approach is a loop over the set of regular expressions that stops at the first one that matches.

```csharp

```
xyz
```

## Further readings
Expand Down
File renamed without changes.

0 comments on commit a7f0d83

Please sign in to comment.