forked from Apollo1840/Happy-Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbbc_crawler.py
45 lines (33 loc) · 1.21 KB
/
bbc_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# -*- coding: utf-8 -*-
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from basic_crawler import basic_crawler
class Crawler_bbc(basic_crawler):
    """Crawler bound to the BBC News front page (https://www.bbc.com/news).

    This subclass only fixes the start URL and adds BBC-specific helper
    methods.

    NOTE(review): ``self.soup`` is not assigned anywhere in this class; it is
    presumably populated by ``basic_crawler.__init__`` as a parsed
    BeautifulSoup document -- confirm against ``basic_crawler``.
    """

    def __init__(self):
        super(Crawler_bbc, self).__init__('https://www.bbc.com/news')

    def header_link(self, file_path):
        """Write every headline and its link to ``file_path``.

        Each ``<a>`` element in ``self.soup`` that both contains an ``<h3>``
        and carries an ``href`` attribute yields one record made of the
        heading text, the full link, and an empty separator line, e.g.::

            Low-carb diets 'could shorten life'
            https://www.bbc.com/news/health-45195474

        Anchors missing either the ``<h3>`` or the ``href`` are skipped
        entirely, so the file never contains a headline without its link.
        The file is opened in write mode (existing content is replaced) and
        encoded as UTF-8 so non-ASCII headlines are written on any platform.

        Args:
            file_path: Destination text file, like ``'material/news.txt'``.

        Raises:
            OSError: If ``file_path`` cannot be opened or written.
        """
        with open(file_path, 'w', encoding='utf-8') as f:
            for anchor in self.soup.find_all('a'):
                heading = anchor.h3
                href = anchor.get('href')
                # Validate both parts up front: a record is written whole or
                # not at all, instead of relying on a swallowed exception
                # half-way through the writes.
                if heading is None or href is None:
                    continue
                link = site_fullname(href, 'https://www.bbc.com')
                f.write(heading.text + '\n' + link + '\n\n')
def site_fullname(href, head):
    """Return ``href`` as an absolute URL, resolved against ``head``.

    Resolution follows the standard URL-joining rules of
    ``urllib.parse.urljoin``:

    * an ``href`` that is already absolute is returned unchanged;
    * a root-relative ``href`` (``/news/x``) is attached to the site in
      ``head``;
    * a protocol-relative ``href`` (``//host/path``) inherits the scheme of
      ``head``;
    * a relative ``href`` without a leading slash gets the separating ``/``;
    * an empty ``href`` yields ``head`` itself.

    Args:
        href: The link target as found in the page.
        head: The base URL of the site, e.g. ``'https://www.bbc.com'``.

    Returns:
        The absolute URL as a string.
    """
    return urljoin(head, href)
def _run_as_script():
    """Dump the crawler's ``html`` and save the headline/link pairs.

    Creates a ``Crawler_bbc``, prints its ``html`` attribute
    (presumably the raw page source set by ``basic_crawler`` -- confirm),
    then writes the headline/link pairs to ``material/news.txt``.
    """
    bbc = Crawler_bbc()
    print(bbc.html)
    bbc.header_link('material/news.txt')


if __name__ == '__main__':
    _run_as_script()