#! /home/orville/venv/devel/bin/python3
import requests
from requests import Request, Session
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime, date
import hashlib
import pandas as pd
import numpy as np
import re
import logging
import argparse
import time
from rich import print
from ml_yahoofinews import yfnews_reader
from ml_urlhinter import url_hinter
from y_topgainers import y_topgainers
# ML / NLP section #############################################################
class ml_nlpreader:
"""
    Class to identify, rank, and classify stock NEWS articles
"""
# global accessors
    args = []            # holds the global args dict passed in from the main() methods
yfn = None # Yahoo Finance News reader instance
mlnlp_uh = None # URL Hinter instance
yti = 0
cycle = 0 # class thread loop counter
def __init__(self, yti, global_args):
cmi_debug = __name__+"::"+self.__init__.__name__
logging.info( f'%s - Instantiate.#{yti}' % cmi_debug )
        self.args = global_args                            # set once at INIT; all methods reference these shared args
self.yti = yti
        self.yfn = yfnews_reader(1, "IBM", global_args )   # instantiate a yfn reader with a dummy symbol (real symbol is set later)
return
########################################## 1 #############################################
# method 1
def nlp_read_all(self, global_args):
"""
The machine will read now!
Read finance.yahoo.com / News 'Brief headlines' (i.e. short text docs)
        Reads news for ALL stock symbols in the Top Gainers DF table.
"""
self.args = global_args
if self.args['bool_news'] is True: # read ALL news for top 10 gainers
cmi_debug = __name__+"::"+self.nlp_read_all.__name__+".#"+str(self.yti)
logging.info( f'%s - Instantiate.#{self.yti}' % cmi_debug )
print ( " " )
print ( "========================= ML (NLP) / Yahoo Finance News Sentiment AI =========================" )
print ( f"Build NLP test dataset / for Top Gainers..." )
newsai_test_dataset = y_topgainers(2) # instantiate class
newsai_test_dataset.get_topg_data() # extract data from finance.Yahoo.com
nx = newsai_test_dataset.build_tg_df0() # build entire dataframe
newsai_test_dataset.build_top10() # build top 10 gainers
print ( " " )
            self.yfn = yfnews_reader(1, "IBM", self.args )      # dummy symbol just for instantiation (nlp_summary() reads self.yfn later)
            self.yfn.init_dummy_session('https://www.finance.yahoo.com')
            self.mlnlp_uh = url_hinter(1, self.args)            # anyone needs to be able to get hints on a URL from anywhere
            self.yfn.share_hinter(self.mlnlp_uh)                # make the url hinter available to the YFN News reader
print ( "============================== Prepare bulk NLP candidate list =================================" )
print ( f"ML/NLP candidates: {newsai_test_dataset.tg_df1['Symbol'].tolist()}" )
            for nlp_target in newsai_test_dataset.tg_df1['Symbol'].tolist():
                self.yfn.update_headers(nlp_target)
                self.yfn.form_url_endpoint(nlp_target)
                self.yfn.do_simple_get()
                self.yfn.scan_news_feed(nlp_target, 0, 1, 0)    # args: symbol | depth=0 | render=1 (JavaScript render engine) | data_page_index=0
                self.yfn.eval_article_tags(nlp_target)          # ml_ingest{} is built here
print ( "============================== NLP candidates are ready =================================" )
            # nlp_summary() needs an ml_ingest{} index, so walk every candidate collected above
            # (assumes ml_ingest{} is keyed by index, as the commented-out .items() loop in nlp_summary() suggests)
            for ml_idx in self.yfn.ml_ingest:
                self.nlp_summary(2, ml_idx)
print ( f" " )
print ( " " )
print ( "========================= Tech Events performance Sentiment =========================" )
return
########################################## 2 #############################################
# method #2
def nlp_read_one(self, news_symbol, global_args):
"""
        The machine will now read!
        Read finance.yahoo.com / News 'Brief headlines' (i.e. short text docs)
        Reads ALL news articles for only ONE stock symbol.
"""
self.args = global_args
cmi_debug = __name__+"::"+self.nlp_read_one.__name__+".#"+str(self.yti)
logging.info( f'%s - IN.#{self.yti}' % cmi_debug )
news_symbol = str(self.args['newsymbol']) # symbol provided on CMDLine
print ( " " )
print ( f"ML (NLP) / News Sentiment for 1 symbol [ {news_symbol} ] =========================" )
self.yfn = yfnews_reader(1, news_symbol, self.args ) # create instance of YFN News reader
self.yfn.init_dummy_session('https://www.finance.yahoo.com')
hpath = '/quote/' + news_symbol + '/news?p=' + news_symbol
self.yfn.update_headers(hpath)
self.yfn.form_url_endpoint(news_symbol)
hash_state = self.yfn.do_js_get(0) # get() & process the page html/JS data
        self.mlnlp_uh = url_hinter(1, self.args)            # create instance of url hinter
self.yfn.yfn_uh = self.mlnlp_uh # send it outside to our YFN News reader instance
# args: symbol | Depth | html/JS | data_page_index
self.yfn.scan_news_feed(news_symbol, 0, 1, 0, hash_state)
        self.yfn.eval_news_feed_stories(news_symbol)        # ml_ingest{} gets built here
print ( f" " )
return
####################################### 3 ##########################################
# method 3
def nlp_summary(self, yti, ml_idx):
"""
        **CRITICAL: Assumes ml_ingest{} has already been pre-populated
        Reads 1 entry from ml_ingest{} and processes it...
"""
self.yti = yti
cmi_debug = __name__+"::"+self.nlp_summary.__name__+".#"+str(self.yti)
logging.info( f'%s - IN.#{yti}' % cmi_debug )
logging.info('%s - ext get request pre-processed by cookiemonster...' % cmi_debug )
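        # map url_hinter locality codes (0-3, 9) to human-readable labels for the summary report below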
locality_code = {
0: 'Local 0',
1: 'Local 1',
2: 'Local 2',
3: 'Remote',
9: 'Unknown locality'
}
print ( " ")
#print ( f"============================ NLP Candidate Summary ============================" )
#for sn_idx, sn_row in self.yfn.ml_ingest.items():
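        # each ml_ingest{} entry is expected to carry at least: 'type', 'symbol', 'url' and 'thint' (the fields read below)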
sn_row = self.yfn.ml_ingest[ml_idx]
if sn_row['type'] == 0: # REAL news, inferred from Depth 0
print( f"{sn_row['symbol']} / Local News article: {ml_idx}" )
            t_url = urlparse(sn_row['url'])                 # WARN: a urlparse() named tuple (NOT the raw url)
uhint, uhdescr = self.mlnlp_uh.uhinter(0, t_url)
thint = (sn_row['thint']) # the hint we guessed at while interrogating page <tags>
logging.info ( f"%s - Logic.#0 Hints for url: [ t:0 / u:{uhint} / h: {thint} ] / {uhdescr}" % cmi_debug )
# WARNING : This is a deep analysis on the page
            r_uhint, r_thint, r_xturl = self.yfn.interpret_page(ml_idx, sn_row)   # go deep, with everything we know about this item
            logging.info ( f"%s - Inferred conf: {r_xturl}" % cmi_debug )
p_r_xturl = urlparse(r_xturl)
            inf_type = self.mlnlp_uh.confidence_lvl(thint)  # returned var is a tuple - (descr, locality code)
#
print ( f"Article type: 0 / {sn_row['url']}" ) # all type 0 are assumed to be REAL news
print ( f"Origin URL: [ {t_url.netloc} ] / {uhdescr} / {inf_type[0]} / ", end="" )
print ( f"{locality_code.get(inf_type[1])}" )
uhint, uhdescr = self.mlnlp_uh.uhinter(21, p_r_xturl)
print ( f"Target URL: [ {p_r_xturl.netloc} ] / {uhdescr} / ", end="" )
print ( f"{locality_code.get(uhint)} [ u:{uhint} ]" )
            return thint                                    # what this article actually is
elif sn_row['type'] == 1: # Micro-Ad, but could possibly be news...
print( f"{sn_row['symbol']} / Fake News stub micro article: {ml_idx}" )
t_url = urlparse(sn_row['url'])
uhint, uhdescr = self.mlnlp_uh.uhinter(1, t_url) # hint on ORIGIN url
thint = (sn_row['thint']) # the hint we guess at while interrogating page <tags>
logging.info ( f"%s - Logic.#1 hint origin url: t:1 / u:{uhint} / h: {thint} {uhdescr}" % cmi_debug )
# WARN:
# deep page analysis, with everything we know about this item
            # prepare to extract text for ML/NLP Gen AI sentiment analysis
# just interprets data for 1 page only
r_uhint, r_thint, r_xturl = self.yfn.interpret_page(ml_idx, sn_row)
logging.info ( f"%s - Logic.#1 hint ext url: {r_xturl}" % cmi_debug )
p_r_xturl = urlparse(r_xturl)
inf_type = self.mlnlp_uh.confidence_lvl(thint)
# summary report...
print ( f"NLP candidate article type: 1" )
print ( f"Origin URL: [ {t_url.netloc} ] / {uhdescr} / {inf_type[0]} / ", end="" )
print ( f"{locality_code.get(inf_type[1], 'in flux')}" )
uhint, uhdescr = self.mlnlp_uh.uhinter(31, p_r_xturl) # hint on TARGET url
print ( f"Target URL: [ {p_r_xturl.netloc} ] / {uhdescr} / ", end="" )
print ( f"{locality_code.get(uhint, 'in flux')} [ u:{uhint} ]" )
return thint
        elif sn_row['type'] == 2:                           # Video story - NOT an NLP candidate
print ( f"Video story type: 2 - NOT an NLP candidate" )
logging.info ( f"%s - #3 skipping..." % cmi_debug )
thint = (sn_row['thint'])
return thint
        elif sn_row['type'] == 9:                           # article type not yet defined
            print ( f"Article type NOT yet defined - NOT an NLP candidate" )
logging.info ( f"%s - #3 skipping..." % cmi_debug )
thint = (sn_row['thint'])
return thint
else:
print ( f"ERROR unknown article type" )
logging.info ( f"%s - #4 skipping..." % cmi_debug )
thint = (sn_row['thint'])
return thint
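
# ------------------------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): a minimal driver showing how this
# class might be exercised. Only the keys 'bool_news' and 'newsymbol' are taken from the
# code above; the logging setup, the plain dict standing in for an argparse namespace and
# the hard-coded symbol are assumptions, and the wider repo may expect additional arg keys.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    global_args = {
        'bool_news': True,            # True -> nlp_read_all() scans every Top Gainers symbol
        'newsymbol': 'IBM'            # symbol consumed by nlp_read_one() (hypothetical choice)
    }
    reader = ml_nlpreader(1, global_args)
    reader.nlp_read_all(global_args)                               # bulk scan: Top Gainers news
    reader.nlp_read_one(global_args['newsymbol'], global_args)     # deep scan for one symbol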