-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.py
66 lines (57 loc) · 2.8 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as soup
from newsapi import NewsApiClient
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from datetime import datetime
from flashtext import KeywordProcessor
# Ensure all NLTK components are downloaded once
# vader_lexicon backs SentimentIntensityAnalyzer below; stopwords is fetched
# too, though nothing in this file visibly uses it.
# NOTE(review): this runs at import time and hits the network when the
# corpora are missing — consider nltk.download(..., quiet=True).
nltk.download(['vader_lexicon', 'stopwords'])
# Set user agent for all HTTP requests
# Browser-like User-Agent so scraped sites (e.g. Finviz) don't reject the request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def get_newsapi_articles(company_ticker, search_date, api_key=None):
    """Fetch news articles for a ticker from News API as a DataFrame.

    Parameters
    ----------
    company_ticker : str
        Ticker symbol used as the search query (e.g. 'AMZN').
    search_date : str
        Earliest publication date, ISO format 'YYYY-MM-DD'.
    api_key : str, optional
        News API key. When omitted, falls back to the NEWSAPI_KEY
        environment variable, then to the original placeholder string
        (which will be rejected by the service).

    Returns
    -------
    pandas.DataFrame
        Columns: 'datetime' (parsed timestamps), 'news_headline',
        'url', and a constant 'source' of 'News API'.
    """
    import os  # local import keeps this fix self-contained

    if api_key is None:
        # Prefer a key supplied via the environment over a hard-coded
        # credential placeholder embedded in source control.
        api_key = os.environ.get('NEWSAPI_KEY', 'your_api_key_here')
    newsapi = NewsApiClient(api_key=api_key)
    articles = newsapi.get_everything(q=company_ticker, from_param=search_date,
                                      language="en", sort_by="publishedAt")
    df_newsapi = pd.DataFrame(articles['articles'], columns=['publishedAt', 'title', 'url'])
    df_newsapi.rename(columns={'publishedAt': 'datetime', 'title': 'news_headline'}, inplace=True)
    df_newsapi['datetime'] = pd.to_datetime(df_newsapi['datetime'])
    df_newsapi['source'] = 'News API'
    return df_newsapi
def get_finviz_news(company_ticker):
    """Scrape the news table from a ticker's Finviz quote page.

    Parameters
    ----------
    company_ticker : str
        Ticker symbol; lower-cased into the Finviz quote URL.

    Returns
    -------
    pandas.DataFrame
        One row per news item with columns 'datetime', 'news_headline',
        'url', 'source' ('Finviz'); empty when the table is absent.
    """
    # Use HTTPS — plain HTTP is typically redirected or blocked by the site.
    url = f"https://finviz.com/quote.ashx?t={company_ticker.lower()}"
    response = requests.get(url, headers=HEADERS)
    html_soup = soup(response.content, "html.parser")
    table = html_soup.find('table', attrs={'class': 'fullview-news-outer'})
    rows = table.findAll('tr') if table else []
    news_data = []
    for row in rows:
        parts = row.text.split('\n')
        anchor = row.find('a')
        # Skip malformed rows instead of raising: the original
        # `split('\n')[1:3]` unpack fails (ValueError) on short rows and
        # `row.find('a')['href']` is a TypeError when no anchor exists.
        if len(parts) < 3 or anchor is None or not anchor.has_attr('href'):
            continue
        datetime_str, headline = parts[1:3]
        news_data.append({
            'datetime': pd.to_datetime(datetime_str),
            'news_headline': headline,
            'url': anchor['href'],
            'source': 'Finviz',
        })
    return pd.DataFrame(news_data)
def merge_news_data(df1, df2):
    """Stack two news DataFrames and drop rows sharing a URL.

    The first occurrence of each URL wins (df1 takes precedence over
    df2), and the index is rebuilt as a clean 0..n-1 range.
    """
    combined = pd.concat([df1, df2])
    unique_rows = combined.drop_duplicates(subset='url')
    return unique_rows.reset_index(drop=True)
def sentiment_analysis(text):
    """Return the VADER compound sentiment score for *text*.

    The compound score is a single float in [-1.0, 1.0] (negative =
    bearish wording, positive = bullish).

    The analyzer is built once and cached on the function object:
    constructing SentimentIntensityAnalyzer reloads the lexicon each
    time, which is wasteful when this is applied per headline via
    DataFrame.apply().
    """
    sia = getattr(sentiment_analysis, '_sia', None)
    if sia is None:
        sia = SIA()
        sentiment_analysis._sia = sia
    return sia.polarity_scores(text)['compound']
def keyword_extraction(text, keywords):
    """Return the distinct keywords from *keywords* found in *text*.

    Uses flashtext's whole-word matching. Duplicates are removed while
    preserving first-occurrence order via dict.fromkeys, so the result
    is deterministic — the original list(set(...)) shuffled the order
    between runs (set iteration order depends on hash randomization).
    """
    processor = KeywordProcessor()
    processor.add_keywords_from_list(keywords)
    found_keywords = processor.extract_keywords(text)
    return list(dict.fromkeys(found_keywords))
def main(company_ticker, search_date):
    """Collect, merge, score, and tag news for a single ticker.

    Pulls headlines from News API (since *search_date*) and Finviz,
    deduplicates them by URL, attaches a VADER 'sentiment' score and a
    'keywords' column of corporate-event terms found in each headline.
    Returns the enriched DataFrame.
    """
    newsapi_df = get_newsapi_articles(company_ticker, search_date)
    finviz_df = get_finviz_news(company_ticker)
    df_news = merge_news_data(newsapi_df, finviz_df)
    df_news['sentiment'] = df_news['news_headline'].apply(sentiment_analysis)
    # Corporate-event terms to flag in headlines.
    keywords = ['merger', 'acquisition', 'buyback', 'split', 'hire',
                'hiring', 'firing', 'lay off', 'laid off']
    df_news['keywords'] = df_news['news_headline'].apply(
        lambda headline: keyword_extraction(headline, keywords))
    return df_news
if __name__ == '__main__':
    # Demo run: Amazon news published on or after 2023-01-01.
    print(main('AMZN', '2023-01-01'))