-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.php
56 lines (38 loc) · 1.23 KB
/
scrape.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
<?php
// Scrapes the MDN HTML element reference into a JSON file.
// The downloaded page is cached on disk so repeated runs do not hit the network.

// Cached copy of the reference page, stored in the current working directory.
define('LOCAL_FILE', getcwd() . '/mdn-reference.html');
// JSON file produced by this script, stored in the current working directory.
define('OUTPUT_FILE', getcwd() . '/mdn-reference.json');

// Download the page only when no cached copy exists yet.
if (!file_exists(LOCAL_FILE)) {
    $cnxn = curl_init();
    if ($cnxn === false) {
        throw new RuntimeException('Unable to initialise cURL.');
    }

    curl_setopt($cnxn, CURLOPT_URL, "https://developer.mozilla.org/en-US/docs/Web/HTML/Element");
    curl_setopt($cnxn, CURLOPT_RETURNTRANSFER, true);
    // Follow redirects so a moved page is fetched instead of an empty redirect body.
    curl_setopt($cnxn, CURLOPT_FOLLOWLOCATION, true);
    // Treat HTTP status codes >= 400 as failures rather than caching an error page.
    curl_setopt($cnxn, CURLOPT_FAILONERROR, true);

    $html = curl_exec($cnxn);
    // Read the error text before the handle is closed.
    $error = curl_error($cnxn);
    curl_close($cnxn);

    // Never cache a failed or empty download: the cache file would otherwise
    // exist on the next run and the broken content would be reused forever.
    if ($html === false || $html === '') {
        throw new RuntimeException(
            'Unable to download the reference page: ' . ($error !== '' ? $error : 'empty response')
        );
    }

    if (file_put_contents(LOCAL_FILE, $html) === false) {
        throw new RuntimeException('Unable to write ' . LOCAL_FILE);
    }
}
// Parse the cached page; the libxml flags silence parser warnings and errors.
$dom = new DOMDocument();
if ($dom->loadHTMLFile(LOCAL_FILE, LIBXML_NOWARNING | LIBXML_NOERROR) === false) {
    throw new RuntimeException('Unable to parse ' . LOCAL_FILE);
}

$xpath = new DOMXPath($dom);
$headings = $xpath->query('//article/h2');

// One entry per <h2>: ['title' => string, 'elements' => [['name' => ..., 'description' => ...], ...]].
$list = [];
foreach ($headings as $heading) {
    $section = [
        'title' => $heading->nodeValue,
        'elements' => [],
    ];
    // Progress output: the section title, then one dot per table row.
    echo "\n" . $heading->nodeValue . "\n";

    // nextSibling can be a whitespace text node or a comment (or null when the
    // heading is the last child); advance to the first following element so the
    // relative XPath below has an element to start from.
    $container = $heading->nextSibling;
    while ($container !== null && $container->nodeType !== XML_ELEMENT_NODE) {
        $container = $container->nextSibling;
    }

    if ($container !== null) {
        foreach ($xpath->query('table/tbody/tr', $container) as $row) {
            echo '.';
            // item(0) returns null for rows without the cell (e.g. header-only
            // rows); emit null for those instead of dereferencing a missing node.
            $nameCell = $xpath->query('td[1]', $row)->item(0);
            $descriptionCell = $xpath->query('td[2]', $row)->item(0);
            $section['elements'][] = [
                'name' => $nameCell !== null ? $nameCell->nodeValue : null,
                'description' => $descriptionCell !== null ? $descriptionCell->nodeValue : null,
            ];
        }
    }

    $list[] = $section;
}
// Serialise the collected sections and write them to the output file.
$json = json_encode($list);
if ($json === false) {
    // json_encode() returns false on failure; surface the reason instead of
    // writing an empty file.
    throw new RuntimeException('Unable to encode the element list as JSON: ' . json_last_error_msg());
}

if (file_put_contents(OUTPUT_FILE, $json) === false) {
    throw new RuntimeException('Unable to write ' . OUTPUT_FILE);
}

// Terminate the progress output with blank lines.
echo "\n\n";