-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.jl
66 lines (51 loc) · 2 KB
/
run.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#=
Copyright (C) 2023 Yiding Song
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
=#
using Downloads
include("parser.jl")
include("wikigraph.jl")
"""
    dunzip(fname; baseurl="https://dumps.wikimedia.org/enwiki/20230101", outdir="data")

Download the archive `fname` from `baseurl` into the current directory, extract it into
`outdir` with the external `7z` program, and delete the downloaded archive.

# Keywords
- `baseurl`: URL prefix the archive is fetched from, without a trailing slash.
- `outdir`: directory handed to `7z` as the extraction target (`-o<outdir>`).

# Throws
Propagates the error raised by `Downloads.download` when the transfer fails, and the
error raised by `run` when `7z` exits with a non-zero status. The downloaded archive is
removed in both cases so that no partial or stale file is left behind.
"""
function dunzip(fname; baseurl="https://dumps.wikimedia.org/enwiki/20230101", outdir="data")
    try
        # `Downloads` is already loaded by this file, so no external `curl` is needed.
        Downloads.download("$(baseurl)/$(fname)", fname)
        run(`7z x $fname -o$outdir`)
    finally
        # `force=true`: the archive may not exist if the download itself failed.
        rm(fname; force=true)
    end
    return nothing
end
# Archive names to process, one per line. Blank lines (such as the empty entry produced
# by a trailing newline) are dropped so they are never treated as an archive name.
files = filter!(
    !isempty, [String(strip(line)) for line in eachline("data/multistream-urls.txt")]
)

# An optional first command-line argument gives the 1-based index to resume from.
start = isempty(ARGS) ? 1 : parse(Int, ARGS[1])
start >= 1 || throw(ArgumentError("start index must be at least 1, got $(start)"))

# Titles file passed to `mineXML` for every archive.
titlesfile = "data/enwiki-20230101-all-titles-in-ns0"

for i in start:length(files)
    logdir = "logs/$(i)"
    errlog = "$(logdir)/title_errors.txt"
    # `mkpath` also creates a missing `logs/` parent and accepts an existing directory.
    mkpath(logdir)
    # Drop the error log left by an earlier run of this index, if there is one.
    rm(errlog; force=true)

    zipname = files[i]
    xmlname = String(split(zipname, ".bz2")[1])
    # The last two "p"-separated fields of the name are read as the first and last page
    # number covered by the archive; their difference plus one is the page count.
    bounds = split(xmlname, "p")
    numpages = parse(Int, bounds[end]) - parse(Int, bounds[end - 1]) + 1
    println("\n[$(i)/$(length(files))] Starting on $(numpages) pages from $(zipname)")

    xmlpath = "data/$(xmlname)"
    # Only download and extract when the XML is not already on disk.
    ispath(xmlpath) || dunzip(zipname)

    # Per the original note: loads the graph from `graph/`, mines the XML to update it,
    # and saves it back into `graph/`. The return value is not needed here.
    mineXML(xmlpath, "graph/", titlesfile, errlog, numpages)
    # The extracted XML is deleted once it has been mined.
    rm(xmlpath)
end
# Final pass, run after every archive has been mined.
titlespath = "data/enwiki-20230101-all-titles-in-ns0"

# Load the graph stored under `graph/` and write it back with `savewgQuick`.
fwg = loadwg("graph/", titlespath)
savewgQuick("graph/", fwg)

# Load it again with `backwards=true` (presumably reversing the edge direction — confirm
# in wikigraph.jl) and write the result to `backgraph/` with both savers, in this order.
bwg = loadwg("graph/", titlespath; backwards=true)
for save in (savewg, savewgQuick)
    save("backgraph/", bwg)
end