Skip to content

Commit

Permalink
Extractor is now ready
Browse files Browse the repository at this point in the history
  • Loading branch information
mhmd-azeez committed Oct 9, 2017
1 parent ed7c247 commit 3f7c1aa
Show file tree
Hide file tree
Showing 11 changed files with 160 additions and 72 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
A small web crawler used to collect Kurdish text over the web

It has these commands:
- [X] **Crawl:** used to crawl web pages and save them to a folder on disk.
- [ ] **Extract:** used to extract Kurdish text from the pages that are collected from the previous command.
- [X] **Crawl:** used to crawl web pages, extract Kurdish text from them, and save it to a folder on disk.
- [ ] **Normalize:** used to convert the text collected in the previous command to standard unicode text.
- [ ] **Merge:** Used to merge the text files produced from the previous commands.
- [ ] **WordList:** used to make a wordlist from the text file that's produced from the previous command.
Expand Down
6 changes: 3 additions & 3 deletions src/DevTree.Crawler/App.config
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
<?xml version="1.0" encoding="utf-8" ?>
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<configSections>
<section name="log4net" type="log4net.Config.Log4NetConfigurationSectionHandler, log4net"/>
<section name="abot" type="Abot.Core.AbotConfigurationSectionHandler, Abot"/>
</configSections>

<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6.1" />
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/>
</startup>

<log4net>
Expand Down Expand Up @@ -44,4 +44,4 @@
</extensionValues>
</abot>

</configuration>
</configuration>
38 changes: 28 additions & 10 deletions src/DevTree.Crawler/Crawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using Abot.Poco;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
Expand All @@ -14,29 +15,35 @@ public class Crawler
private const char IgnoreCharacter = '\0';

private string _savePath = null;
private Uri _uriToCrawl;
private Uri _seedUrl;
private int _delay;
private int _maxPages;
private List<WebPage> _webPages;
private string _statsFilePath;
private const string StatsFileName = "$Stats.txt";
public Crawler(string[] args)
{
_uriToCrawl = new Uri(ParameterHelper.GetParameter(args, "-url", "absolute url"));
_seedUrl = new Uri(ParameterHelper.GetParameter(args, "-url", "absolute url"));
_savePath = ParameterHelper.GetParameter(args, "-output", $" output path");
_delay = ParameterHelper.GetIntegerParameter(args, "-delay", 1000);
_maxPages = ParameterHelper.GetIntegerParameter(args, "-pages", 250);
_statsFilePath = ParameterHelper.GetPath(_savePath, StatsFileName);
}

public string Crawl(List<WebPage> webPages)
{
if (File.Exists(_statsFilePath) && webPages.Count == 0)
File.Delete(_statsFilePath);

_webPages = webPages ?? new List<WebPage>();

IWebCrawler crawler;

crawler = GetDefaultWebCrawler(_maxPages, _delay);

crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

CrawlResult result = crawler.Crawl(_uriToCrawl);
CrawlResult result = crawler.Crawl(_seedUrl);

return _savePath;
}
Expand All @@ -61,26 +68,37 @@ private IWebCrawler GetDefaultWebCrawler(int maxPagesToCrawl, int delayInMillise

void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
var contents = e.CrawledPage.Content.Text;
var stopWatch = new Stopwatch();
stopWatch.Start();

var contents = Extractor.Extract(e.CrawledPage.Content.Text);

var page = new WebPage
{
FileName = ParameterHelper.GetPath(_savePath, (_webPages.Count + 1).ToString() + ".txt"),
Url = e.CrawledPage.Uri.AbsoluteUri
Url = e.CrawledPage.Uri.AbsoluteUri,
NumberOfWords = contents.Split(' ').Length
};

_webPages.Add(page);

IOHelper.SaveFile(page.FileName, contents);

SaveStats(page);

stopWatch.Stop();
Console.WriteLine("Pages crowled: " + _webPages.Count);
Console.WriteLine($"Page Crawled: {page.Url}, Saved to: {page.FileName}.");
Console.WriteLine($"Page Crawled: {page.Url}, Number Of Words: {page.NumberOfWords:n0}. Processed in {stopWatch.ElapsedMilliseconds:n0} ms");
}

public void SaveStats()
public void SaveStats(WebPage page)
{
var statistics = _webPages.Select(w => $"{w.Url}, {w.FileName}").ToArray();
IOHelper.SaveFile(ParameterHelper.GetPath(_savePath, StatsFileName), statistics);
var content = $"{page.Url},{page.FileName},{page.NumberOfWords}{Environment.NewLine}";

if (File.Exists(_statsFilePath))
File.AppendAllText(_statsFilePath, content);
else
IOHelper.SaveFile(_statsFilePath, content);
}

}
Expand Down
14 changes: 12 additions & 2 deletions src/DevTree.Crawler/DevTree.Crawler.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
<OutputType>Exe</OutputType>
<RootNamespace>DevTree.Crawler</RootNamespace>
<AssemblyName>crawler</AssemblyName>
<TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
<TargetFrameworkProfile />
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
Expand Down Expand Up @@ -43,6 +44,9 @@
<Reference Include="CsQuery, Version=1.3.3.249, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\CsQuery.1.3.4\lib\net40\CsQuery.dll</HintPath>
</Reference>
<Reference Include="DevTree.BeKurdi, Version=0.0.3.1, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\DevTree.BeKurdi.0.0.3.1\lib\netstandard1.0\DevTree.BeKurdi.dll</HintPath>
</Reference>
<Reference Include="HtmlAgilityPack, Version=1.4.7.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\..\packages\Abot.1.5.1.69\lib\net40\HtmlAgilityPack.dll</HintPath>
</Reference>
Expand All @@ -53,7 +57,13 @@
<HintPath>..\..\packages\NRobotsPatched.1.0.8.0\lib\net40\Robots.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.ComponentModel.Composition" />
<Reference Include="System.Core" />
<Reference Include="System.IO.Compression" />
<Reference Include="System.Numerics" />
<Reference Include="System.Runtime.InteropServices.RuntimeInformation, Version=4.0.1.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\..\packages\System.Runtime.InteropServices.RuntimeInformation.4.3.0\lib\net45\System.Runtime.InteropServices.RuntimeInformation.dll</HintPath>
</Reference>
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
Expand All @@ -63,8 +73,8 @@
</ItemGroup>
<ItemGroup>
<Compile Include="Crawler.cs" />
<Compile Include="Extractor.cs" />
<Compile Include="IOHelper.cs" />
<Compile Include="Kurdish.cs" />
<Compile Include="KurdishStringComparer.cs" />
<Compile Include="Merger.cs" />
<Compile Include="ParameterHelper.cs" />
Expand Down
65 changes: 65 additions & 0 deletions src/DevTree.Crawler/Extractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
using AngleSharp.Parser.Html;
using DevTree.BeKurdi;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DevTree.Crawler
{
/// <summary>
/// Reduces the HTML of a crawled page to a stream of Sorani Kurdish characters.
/// Every character outside the accepted set is treated as a space, and runs of
/// spaces / full stops are collapsed so the output stays compact.
/// </summary>
public static class Extractor
{
    private const char IgnoredChar = '\u0000';

    /// <summary>
    /// Characters that survive extraction: both Sorani alphabets, Sorani digits,
    /// the full stop, the Sorani question mark and the space.
    /// Built once and stored in a hash set so the per-character membership test
    /// in <see cref="RemoveUnwantedCharacters"/> is O(1) instead of a list scan.
    /// </summary>
    private static readonly HashSet<char> AcceptableCharacters = new HashSet<char>(
        Kurdish.SoraniAlphabet
            .Union(Kurdish.NonStandardSoraniAlphabet)
            .Union(Kurdish.SoraniNumbers)
            .Union(new[]
            {
                Kurdish.FullStop,
                Kurdish.SoraniQuestionMark,
                Kurdish.Space
            }));

    /// <summary>
    /// Extracts the accepted Kurdish characters from a page.
    /// </summary>
    /// <param name="fullText">Raw page text, normally an HTML document.</param>
    /// <returns>
    /// The filtered text; an empty string when <paramref name="fullText"/> is
    /// null or empty.
    /// </returns>
    public static string Extract(string fullText)
    {
        // Nothing to parse: avoid handing null to the HTML parser.
        if (string.IsNullOrEmpty(fullText))
        {
            return string.Empty;
        }

        var parser = new HtmlParser();
        var document = parser.Parse(fullText);

        // Only the <body> markup is filtered; when no body element is found
        // the raw input is filtered instead.
        // NOTE(review): InnerHtml still contains tags and attributes; they are
        // only removed indirectly because their characters are not in the
        // accepted set. Confirm this is preferred over the element's text content.
        var body = document.GetElementsByTagName("body").FirstOrDefault()?.InnerHtml;

        return RemoveUnwantedCharacters(AcceptableCharacters, body ?? fullText);
    }

    /// <summary>
    /// Copies <paramref name="text"/> into a new string, replacing every
    /// character that is not in <paramref name="acceptableChars"/> with a space
    /// and emitting at most one separator (space or full stop) in a row.
    /// </summary>
    /// <param name="acceptableChars">Set of characters to keep verbatim.</param>
    /// <param name="text">Text to filter.</param>
    /// <returns>The filtered, separator-collapsed text.</returns>
    private static string RemoveUnwantedCharacters(HashSet<char> acceptableChars, string text)
    {
        var builder = new StringBuilder(text.Length / 2);

        // True while the previous emitted character was a separator, so that
        // following separators are dropped until a regular character appears.
        bool skipNextChar = false;
        foreach (var rawChar in text)
        {
            var currentChar = acceptableChars.Contains(rawChar) ? rawChar : Kurdish.Space;

            switch (currentChar)
            {
                case Kurdish.FullStop:
                case Kurdish.Space:
                    if (!skipNextChar)
                    {
                        builder.Append(currentChar);
                        skipNextChar = true;
                    }
                    break;

                case IgnoredChar:
                    // Never appended; only ends the current separator run.
                    skipNextChar = false;
                    break;

                default:
                    builder.Append(currentChar);
                    skipNextChar = false;
                    break;
            }
        }

        return builder.ToString();
    }
}
}
36 changes: 0 additions & 36 deletions src/DevTree.Crawler/Kurdish.cs

This file was deleted.

28 changes: 15 additions & 13 deletions src/DevTree.Crawler/KurdishStringComparer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,23 @@ public class KurdishStringComparer : IComparer<string>
{
public int Compare(string first, string second)
{
if (first == null)
{
if (second == null) return 0;
return -1;
}
if (second == null) return 1;
//if (first == null)
//{
// if (second == null) return 0;
// return -1;
//}
//if (second == null) return 1;

for (int i = 0; i < first.Length && i < second.Length; i++)
{
var difference = Kurdish.Weight(first[i]) - Kurdish.Weight(second[i]);
if (difference != 0) return difference;
}
//for (int i = 0; i < first.Length && i < second.Length; i++)
//{
// var difference = Kurdish.Weight(first[i]) - Kurdish.Weight(second[i]);
// if (difference != 0) return difference;
//}

if (first.Length == second.Length) return 0;
return first.Length < second.Length ? -1 : 1;
//if (first.Length == second.Length) return 0;
//return first.Length < second.Length ? -1 : 1;

return 0;
}
}
}
5 changes: 1 addition & 4 deletions src/DevTree.Crawler/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,19 @@ class Program
{
static void Main(string[] args)
{
log4net.Config.XmlConfigurator.Configure();

if (args.Length >= 1)
{
switch (args[0].ToLower())
{
case "crawl":
var seedPagesFile = ParameterHelper.GetParameter(args, "-seed", "seed pages file");
var seedPages = File.ReadAllLines(seedPagesFile);
var savePath = ParameterHelper.GetParameter(args, "-output", $" output path");

var webPages = new List<WebPage>();
foreach (var page in seedPages)
{
var pageCrawler = new Crawler(args.Union(new string[] { "-url", page }).ToArray());
pageCrawler.Crawl(webPages);
pageCrawler.SaveStats();
}

break;
Expand Down
4 changes: 2 additions & 2 deletions src/DevTree.Crawler/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("0.3.0.0")]
[assembly: AssemblyFileVersion("0.3.0.0")]
[assembly: AssemblyVersion("0.4.0.0")]
[assembly: AssemblyFileVersion("0.4.0.0")]
1 change: 1 addition & 0 deletions src/DevTree.Crawler/WebPage.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ public class WebPage
{
public string Url { get; set; }
public string FileName { get; set; }
public int NumberOfWords { get; set; }
}
}
32 changes: 32 additions & 0 deletions src/DevTree.Crawler/packages.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,40 @@
<package id="Abot" version="1.5.1.69" targetFramework="net461" />
<package id="AngleSharp" version="0.9.9" targetFramework="net461" />
<package id="CsQuery" version="1.3.4" targetFramework="net461" />
<package id="DevTree.BeKurdi" version="0.0.3.1" targetFramework="net45" />
<package id="log4net" version="2.0.7" targetFramework="net461" />
<package id="Microsoft.Bcl" version="1.1.10" targetFramework="net461" />
<package id="Microsoft.Bcl.Build" version="1.0.21" targetFramework="net461" />
<package id="Microsoft.NETCore.Platforms" version="1.1.0" targetFramework="net45" />
<package id="NETStandard.Library" version="1.6.1" targetFramework="net45" />
<package id="NRobotsPatched" version="1.0.8.0" targetFramework="net461" />
<package id="System.Collections" version="4.3.0" targetFramework="net45" />
<package id="System.Collections.Concurrent" version="4.3.0" targetFramework="net45" />
<package id="System.Diagnostics.Debug" version="4.3.0" targetFramework="net45" />
<package id="System.Diagnostics.Tools" version="4.3.0" targetFramework="net45" />
<package id="System.Diagnostics.Tracing" version="4.3.0" targetFramework="net45" />
<package id="System.Globalization" version="4.3.0" targetFramework="net45" />
<package id="System.IO" version="4.3.0" targetFramework="net45" />
<package id="System.IO.Compression" version="4.3.0" targetFramework="net45" />
<package id="System.Linq" version="4.3.0" targetFramework="net45" />
<package id="System.Linq.Expressions" version="4.3.0" targetFramework="net45" />
<package id="System.Net.Http" version="4.3.0" targetFramework="net45" />
<package id="System.Net.Primitives" version="4.3.0" targetFramework="net45" />
<package id="System.ObjectModel" version="4.3.0" targetFramework="net45" />
<package id="System.Reflection" version="4.3.0" targetFramework="net45" />
<package id="System.Reflection.Extensions" version="4.3.0" targetFramework="net45" />
<package id="System.Reflection.Primitives" version="4.3.0" targetFramework="net45" />
<package id="System.Resources.ResourceManager" version="4.3.0" targetFramework="net45" />
<package id="System.Runtime" version="4.3.0" targetFramework="net45" />
<package id="System.Runtime.Extensions" version="4.3.0" targetFramework="net45" />
<package id="System.Runtime.InteropServices" version="4.3.0" targetFramework="net45" />
<package id="System.Runtime.InteropServices.RuntimeInformation" version="4.3.0" targetFramework="net45" />
<package id="System.Runtime.Numerics" version="4.3.0" targetFramework="net45" />
<package id="System.Text.Encoding" version="4.3.0" targetFramework="net45" />
<package id="System.Text.Encoding.Extensions" version="4.3.0" targetFramework="net45" />
<package id="System.Text.RegularExpressions" version="4.3.0" targetFramework="net45" />
<package id="System.Threading" version="4.3.0" targetFramework="net45" />
<package id="System.Threading.Tasks" version="4.3.0" targetFramework="net45" />
<package id="System.Xml.ReaderWriter" version="4.3.0" targetFramework="net45" />
<package id="System.Xml.XDocument" version="4.3.0" targetFramework="net45" />
</packages>

0 comments on commit 3f7c1aa

Please sign in to comment.