scraper.pl
#!/usr/bin/perl
use 5.010;
use strict;
use warnings;
use Mojo::UserAgent;
use File::Path qw(make_path);

# Credentials for HTTP basic auth (placeholders)
my $username = "Interim P";
my $password = "Interim P";
if (@ARGV != 3) {
    say STDERR "Usage: perl $0 [series_number] [highest_exercise_number] [type: .pl or .ps1]";
    exit 1;
}
my ($reeks_no, $aantal_oef, $url_postfix) = @ARGV;
##### IO SETUP ########################################
# Directories
my $dir_output = 'output';
# Create the output directory tree, including the per-series
# subdirectory that to_file() writes into
make_path "$dir_output/reeks$reeks_no";
##### SCRAPER SETUP ###################################
# FIFO queue for URLs
my @urls;
my $url_prefix = "https://$username:$password\@intranet.tiwi.ugent.be/Besturingssystemen-III/Labo/p$reeks_no/";
for my $i (1 .. $aantal_oef) {
    # Exercise numbers are zero-padded to two digits on the server
    push @urls, sprintf "%s%02d%s", $url_prefix, $i, $url_postfix;
}
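# e.g. with hypothetical arguments reeks_no=4 and url_postfix=".pl",
# exercise 3 becomes:
#   https://<user>:<pass>@intranet.tiwi.ugent.be/Besturingssystemen-III/Labo/p4/03.pl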
my $total_urls = 0;
# Limit parallel connections
my $max_con = 20;
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);
# Keep track of active connections
my $active = 0;
##### MAIN ############################################
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_con) {
            # Dequeue, or halt if no URLs are queued and no crawlers are active
            my $url;
            unless ($url = shift @urls) {
                return ($active or end_scraping());
            }
            # Fetch non-blocking by passing a callback, and mark as active
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
##### MAIN END ########################################
##### SUBS ############################################
## Check callback if URL is valid and parse html if so
sub get_callback {
    # Non-blocking callbacks receive the user agent and the transaction
    my (undef, $tx) = @_;
    # Deactivate
    --$active;
    # Only use successful (2xx) responses
    return unless $tx->res->is_success;
    # Report the fetched URL
    my $url = $tx->req->url;
    say $url;
    ++$total_urls;
    parse_html($url, $tx);
    return;
}
## Extract the response body for the given URL
sub parse_html {
    my ($url, $tx) = @_;
    # The response body is the content without the HTTP headers
    my $body = $tx->res->body;
    # Write data to file
    to_file($url, $body);
    return;
}
## Write the scraped file body to disk for later testing
sub to_file {
    my ($url, $body) = @_;
    # Take the exercise number from the last path segment of the URL
    my ($id) = $url =~ m(^.+/(.+?)\Q$url_postfix\E$);
    open(my $out, ">", "$dir_output/reeks$reeks_no/Reeks${reeks_no}_$id$url_postfix")
        or die "Failed to open output file: $!";
    say $out $body;
    close($out);
    return;
}
## End the scraping process and report the result
sub end_scraping {
    Mojo::IOLoop->stop if Mojo::IOLoop->is_running;
    say "Fetched $total_urls file(s).";
    exit(0);
}
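## Example run (hypothetical arguments: series 4, six exercises, Perl files):
##   perl scraper.pl 4 6 .pl
## Output lands in output/reeks4/Reeks4_01.pl .. output/reeks4/Reeks4_06.pl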