-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ed7c247
commit 3f7c1aa
Showing
11 changed files
with
160 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
using AngleSharp.Parser.Html; | ||
using DevTree.BeKurdi; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
|
||
namespace DevTree.Crawler | ||
{ | ||
/// <summary>
/// Reduces an HTML (or plain-text) string to the characters of the accepted
/// Sorani character set, replacing everything else with a single separator.
/// </summary>
public static class Extractor
{
    // Sentinel handled by the filter loop: it is never written to the output and
    // it ends the current separator run. It is only seen by the loop if the
    // accepted character set itself contains U+0000.
    private const char IgnoredChar = '\u0000';

    /// <summary>
    /// Parses <paramref name="fullText"/> as HTML and filters the inner HTML of its
    /// first <c>body</c> element; when no body is found, the raw input is filtered instead.
    /// </summary>
    /// <param name="fullText">The HTML or plain text to extract from.</param>
    /// <returns>
    /// The filtered text containing only accepted characters, with runs of
    /// separators collapsed; an empty string when <paramref name="fullText"/> is
    /// null or empty.
    /// </returns>
    public static string Extract(string fullText)
    {
        // Nothing to extract; also avoids dereferencing a null input further down.
        if (string.IsNullOrEmpty(fullText))
        {
            return string.Empty;
        }

        // A hash set gives constant-time membership tests; the filter loop probes
        // this set once per input character.
        var acceptedCharacters = new HashSet<char>(
            Kurdish.SoraniAlphabet
                .Union(Kurdish.NonStandardSoraniAlphabet)
                .Union(Kurdish.SoraniNumbers)
                .Union(new char[]
                {
                    Kurdish.FullStop,
                    Kurdish.SoraniQuestionMark,
                    Kurdish.Space
                }));

        var parser = new HtmlParser();
        var document = parser.Parse(fullText);

        // InnerHtml is passed through the same filter as any other text, so every
        // character outside the accepted set ends up as a separator.
        var body = document.GetElementsByTagName("body").FirstOrDefault()?.InnerHtml;

        return RemoveUnwantedCharacters(acceptedCharacters, body ?? fullText);
    }

    /// <summary>
    /// Copies <paramref name="text"/> to a new string, turning every character that
    /// is not in <paramref name="acceptableChars"/> into <c>Kurdish.Space</c> and
    /// collapsing each run of separators (<c>Kurdish.FullStop</c> or
    /// <c>Kurdish.Space</c>) to its first character.
    /// </summary>
    /// <param name="acceptableChars">Characters that are copied through unchanged.</param>
    /// <param name="text">The text to filter; must not be null.</param>
    /// <returns>The filtered text. Leading and trailing separators are not trimmed.</returns>
    private static string RemoveUnwantedCharacters(HashSet<char> acceptableChars, string text)
    {
        var builder = new StringBuilder(text.Length / 2);

        // True while the most recently handled character was a separator that has
        // already been written, so further separators in the same run are dropped.
        bool insideSeparatorRun = false;

        foreach (char rawChar in text)
        {
            char currentChar = acceptableChars.Contains(rawChar) ? rawChar : Kurdish.Space;

            switch (currentChar)
            {
                case Kurdish.FullStop:
                case Kurdish.Space:
                    // Only the first separator of a run is kept, whichever of the
                    // two it is (so a full stop that follows a space is dropped).
                    if (!insideSeparatorRun)
                    {
                        builder.Append(currentChar);
                        insideSeparatorRun = true;
                    }
                    break;

                case IgnoredChar:
                    insideSeparatorRun = false;
                    break;

                default:
                    builder.Append(currentChar);
                    insideSeparatorRun = false;
                    break;
            }
        }

        return builder.ToString();
    }
}
} |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters