Accept one url instead of a list of seeds
Don't crawl a page more than once
mhmd-azeez committed Oct 9, 2017
1 parent 3f7c1aa commit 26d87f2
Showing 4 changed files with 29 additions and 39 deletions.
16 changes: 5 additions & 11 deletions README.md
@@ -11,23 +11,17 @@ It has these commands:

### Crawl
```
./crawler.exe crawl -seed <seed> -output <output> [-delay <delay>] [-pages <pages>]
./crawler.exe crawl -url <url> -output <output> [-delay <delay>] [-pages <pages>]
```

#### Parameters:
- `seed`: The text file containing the **absolute** URLs of the seed pages. The more links a seed page contains, the better. The URLs in the text file should be separated by line breaks, like so:

```
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Wikipedia:About
...
```
- `url`: The **absolute** URL for the site you want to crawl.
- `output`: The folder to save the crawled pages. The crawler will also save a `$Stats.txt` file that contains the crawling stats.
- `delay`: Number of milliseconds to wait between crawling two pages. Default value is `1000`
- `pages`: Maximum number of pages to crawl for **each** seed page. Default value is `250`
- `pages`: Maximum number of pages to crawl. Default value is `250`

#### Examples:
```
./crawler.exe crawl -seed ./seed.txt -output ./Data
./crawler.exe crawl -seed ./seed.txt -output D:\CrawledPages\ -delay 250 -pages 1000
./crawler.exe crawl -url https://ckb.wikipedia.org -output ./Data
./crawler.exe crawl -url https://www.google.iq/ -output D:/CrawledPages/ -delay 250 -pages 1000
```
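
The flags above are plain `-name value` pairs on the command line. The project's `ParameterHelper` (referenced in `Crawler.cs` below) is not part of this diff, so the following is only a hypothetical sketch of how such flags could be read; the class and method names are illustrative, not the repository's actual code:

```
using System;

// Hypothetical flag parsing, shown only to illustrate the CLI contract above.
// The real ParameterHelper used by the crawler is not included in this commit.
public static class ArgsSketch
{
    // Required string flag, e.g. -url or -output: throws if the flag is missing.
    public static string GetString(string[] args, string flag)
    {
        var index = Array.IndexOf(args, flag);
        if (index < 0 || index + 1 >= args.Length)
            throw new ArgumentException($"Missing required parameter {flag}");
        return args[index + 1];
    }

    // Optional integer flag, e.g. -delay or -pages: falls back to a default value.
    public static int GetInt(string[] args, string flag, int defaultValue)
    {
        var index = Array.IndexOf(args, flag);
        if (index >= 0 && index + 1 < args.Length && int.TryParse(args[index + 1], out var value))
            return value;
        return defaultValue;
    }
}

// Usage mirroring the defaults documented above:
//   var url    = ArgsSketch.GetString(args, "-url");
//   var output = ArgsSketch.GetString(args, "-output");
//   var delay  = ArgsSketch.GetInt(args, "-delay", 1000);  // milliseconds
//   var pages  = ArgsSketch.GetInt(args, "-pages", 250);
```
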
36 changes: 20 additions & 16 deletions src/DevTree.Crawler/Crawler.cs
@@ -15,37 +15,35 @@ public class Crawler
private const char IgnoreCharacter = '\0';

private string _savePath = null;
private Uri _seedUrl;
private int _delay;
private int _maxPages;
private List<WebPage> _webPages;
private HashSet<WebPage> _webPages;
private string _statsFilePath;
private const string StatsFileName = "$Stats.txt";
private Uri _seedUrl;
public Crawler(string[] args)
{
_seedUrl = new Uri(ParameterHelper.GetParameter(args, "-url", "absolute url"));
_savePath = ParameterHelper.GetParameter(args, "-output", $" output path");
_delay = ParameterHelper.GetIntegerParameter(args, "-delay", 1000);
_maxPages = ParameterHelper.GetIntegerParameter(args, "-pages", 250);
_statsFilePath = ParameterHelper.GetPath(_savePath, StatsFileName);
_seedUrl = new Uri(ParameterHelper.GetParameter(args, "-url", " seed page"));
}

public string Crawl(List<WebPage> webPages)
public HashSet<WebPage> Crawl()
{
if (File.Exists(_statsFilePath) && webPages.Count == 0)
if (File.Exists(_statsFilePath))
File.Delete(_statsFilePath);

_webPages = webPages ?? new List<WebPage>();

IWebCrawler crawler;

crawler = GetDefaultWebCrawler(_maxPages, _delay);
var equalityComparer = ProjectionEqualityComparer.Create<WebPage, string>(page => page.Url);
_webPages = new HashSet<WebPage>(equalityComparer);

var crawler = GetDefaultWebCrawler(_maxPages, _delay);
crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

CrawlResult result = crawler.Crawl(_seedUrl);

return _savePath;
return _webPages;
}

private IWebCrawler GetDefaultWebCrawler(int maxPagesToCrawl, int delayInMilliseconds)
@@ -71,16 +69,22 @@ void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
var stopWatch = new Stopwatch();
stopWatch.Start();

var contents = Extractor.Extract(e.CrawledPage.Content.Text);

var page = new WebPage
{
FileName = ParameterHelper.GetPath(_savePath, (_webPages.Count + 1).ToString() + ".txt"),
Url = e.CrawledPage.Uri.AbsoluteUri,
NumberOfWords = contents.Split(' ').Length
Url = e.CrawledPage.Uri.AbsoluteUri
};

_webPages.Add(page);
var success = _webPages.Add(page);

if (!success)
{
Console.WriteLine("This page ha already been crawled...");
return;
}

var contents = Extractor.Extract(e.CrawledPage.Content.Text);
page.NumberOfWords = contents.Split(' ').Length;

IOHelper.SaveFile(page.FileName, contents);

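
Replacing the `List<WebPage>` with a `HashSet<WebPage>` built on a projection comparer over `Url` is what enforces the "don't crawl a page more than once" rule: a second visit to the same URL makes `Add` return `false`, and the handler returns before extracting or saving anything. Below is a minimal, self-contained sketch of that idea, using a hand-rolled comparer in place of `ProjectionEqualityComparer` and a stripped-down `WebPage`; it illustrates the technique and is not code from the repository:

```
using System;
using System.Collections.Generic;

// Stripped-down stand-in for the project's WebPage type (illustrative only).
public class WebPage
{
    public string Url { get; set; }
    public string FileName { get; set; }
    public int NumberOfWords { get; set; }
}

// Treats two WebPage instances as equal when their Url values match, which is
// the effect ProjectionEqualityComparer.Create<WebPage, string>(p => p.Url) has in the commit.
public class WebPageUrlComparer : IEqualityComparer<WebPage>
{
    public bool Equals(WebPage x, WebPage y) =>
        string.Equals(x?.Url, y?.Url, StringComparison.Ordinal);

    public int GetHashCode(WebPage page) =>
        page?.Url?.GetHashCode() ?? 0;
}

public static class Demo
{
    public static void Main()
    {
        var pages = new HashSet<WebPage>(new WebPageUrlComparer());

        // First visit: Add returns true, so the page would be extracted and saved.
        Console.WriteLine(pages.Add(new WebPage { Url = "https://ckb.wikipedia.org/wiki/A" })); // True

        // Second visit to the same URL: Add returns false, so the handler bails out early.
        Console.WriteLine(pages.Add(new WebPage { Url = "https://ckb.wikipedia.org/wiki/A" })); // False
    }
}
```

Because the duplicate check now runs before `Extractor.Extract`, repeat hits cost almost nothing, and since `FileName` is numbered from `_webPages.Count + 1` (which only grows on a successful `Add`), the saved files keep a dense 1.txt, 2.txt, … numbering.
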
12 changes: 2 additions & 10 deletions src/DevTree.Crawler/Program.cs
@@ -16,16 +16,8 @@ static void Main(string[] args)
switch (args[0].ToLower())
{
case "crawl":
var seedPagesFile = ParameterHelper.GetParameter(args, "-seed", "seed pages file");
var seedPages = File.ReadAllLines(seedPagesFile);

var webPages = new List<WebPage>();
foreach (var page in seedPages)
{
var pageCrawler = new Crawler(args.Union(new string[] { "-url", page }).ToArray());
pageCrawler.Crawl(webPages);
}

var pageCrawler = new Crawler(args);
var webPages = pageCrawler.Crawl();
break;

default:
4 changes: 2 additions & 2 deletions src/DevTree.Crawler/Properties/AssemblyInfo.cs
@@ -32,5 +32,5 @@
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("0.4.0.0")]
[assembly: AssemblyFileVersion("0.4.0.0")]
[assembly: AssemblyVersion("0.5.0.0")]
[assembly: AssemblyFileVersion("0.5.0.0")]
