-
Notifications
You must be signed in to change notification settings - Fork 0
/
categorizer.py
52 lines (45 loc) · 1.74 KB
/
categorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from ktrain import load_predictor
from loguru import logger
# Path handed to ktrain's load_predictor() below to restore the saved predictor.
categorizer_model = './model/model'
# Maps each category name to the URL keywords that select it; consumed by
# get_category_from_url(). That function walks the entries in the order written
# here and returns on the first keyword hit, so earlier categories win ties.
expected_domains = {
'politics': ['politics'],
'sports': ['sports', 'sport', 'basketball', 'tennis', 'football'],
'crime': ['crime', 'law'],
'economy': ['economy', 'finance', 'business', 'markets', 'market', 'forbes'],
'entertainment': ['entertainment', 'music', 'film', 'movie', 'hollywood', 'celebrity', 'celebrity-news'],
'life': ['lifestyle', 'life-style', 'life', 'health'],
'science and technology': ['technology', 'product', 'science', 'tech'],
'uncategorized': ['uncategorized'],
}
# The predictor is loaded by a module-level statement, i.e. when this module is
# first imported, and is then shared by every get_category_from_content() call.
model = load_predictor(categorizer_model)
logger.info('Model is loaded.')
def get_category_from_url(url):
    """Guess an article's category from keywords found in its URL.

    Two strategies are tried, in this order:

    1. Exact match: one of the path segments between the host and the final
       segment (``url.split('/')[3:-1]``) is equal to a keyword listed in
       ``expected_domains``.
    2. Substring fallback: a keyword occurs anywhere in the URL, which also
       covers keywords that only appear in the host name or final segment.

    Within each strategy the categories are checked in ``expected_domains``
    order and the first hit wins.

    :param url: URL to classify. The exact-match step assumes the
        ``scheme://host/segment/.../title`` layout; other layouts simply
        fall through to the substring step.
    :type url: str
    :return: ``{'url': url, 'category': name}`` where ``name`` is a key of
        ``expected_domains`` or ``"unknown"`` when no keyword matches.
    :rtype: dict
    """
    # For 'scheme://host/a/b/title' the split yields 'scheme:', '' and the
    # host at indices 0-2 and the title last, leaving ['a', 'b'] here.
    path_segments = url.split('/')[3:-1]
    response = {'url': url}

    # Exact segment matches are the most specific signal, so they get the
    # first chance. (This check used to sit behind `if not categories`, which
    # meant it only ran when there were no segments to inspect.)
    for domain, subdomains in expected_domains.items():
        if any(segment in subdomains for segment in path_segments):
            response['category'] = domain
            return response

    # Fallback for URLs without a matching segment, including URLs that have
    # no intermediate segments at all.
    for domain, subdomains in expected_domains.items():
        if any(subdomain in url for subdomain in subdomains):
            response['category'] = domain
            return response

    response['category'] = "unknown"
    return response
def get_category_from_content(content):
    """Classify a piece of text with the module-level predictor.

    The text is lower-cased first; the lower-cased form is what gets logged,
    passed to ``model.predict`` and echoed back in the result.

    Only supports ['business', 'entertainment', 'politics', 'sport', 'tech'].

    :param content: text to classify; must provide ``.lower()``
    :type content: str
    :return: dict with the lower-cased text under ``'content'`` and the
        prediction, converted with ``str()``, under ``'category'``
    :rtype: dict
    """
    normalized = content.lower()
    content_message = "Content {}".format(normalized)
    logger.debug(content_message)

    label = str(model.predict(normalized))
    prediction_message = "Predicted category is {}".format(label)
    logger.debug(prediction_message)

    result = {}
    result['content'] = normalized
    result['category'] = label
    return result