#Below is the list of libraries that were used
import csv
import numpy as np
import pandas as pd 
import math

#Plotting
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

#NLTK toolkit
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('punkt')

#Sklearn 
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import PowerTransformer

from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.api import OLS
import datetime
from datetime import timedelta


import tensorflow as tf

print(tf.__version__)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Yoyo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yoyo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

2.0.0


tweets = pd.read_csv('tweets.csv') #president tweets 
tweets = tweets.rename(columns={'created_at':'Date'})
tweets['text'] = tweets['text'].apply(str) #convert to string for NLTK
tweets['hour'] = pd.to_datetime(tweets['Date'], errors='coerce').dt.hour
tweets['Date'] = pd.to_datetime(tweets['Date'], errors='coerce').dt.date #convert dates from str to regular dates
tweets = tweets[tweets['is_retweet'] == 'false'].reset_index(drop=True) #remove retweets
display(tweets.head())


#CBOE = pd.read_csv('^VIX.csv') #SNP500 volatility index
CBOE = pd.read_csv('^GSPC.csv') #SNP500 index
#CBOE = pd.read_csv('GOVT.csv') #SNP500 volatility index
CBOE['Date'] = pd.to_datetime(CBOE['Date']).dt.date #convert dates from str to date
CBOE.head()


CBOE['Date'] = CBOE['Date']-timedelta(days=1)


CBOE.head()


#moving average
CBOE['moving_avg'] = CBOE['Close'].rolling(window=28, center = True).mean()
#daily change vs moving average
CBOE['daily_change_close'] = (CBOE['Close']-CBOE['Close'].rolling(window=28).mean())/CBOE['Close'].rolling(window=28).mean()*100
#daily high or low vs moving avarege
CBOE['daily_change_high'] = (CBOE['High']-CBOE['Close'].rolling(window=28).mean())/CBOE['Close'].rolling(window=28).mean()*100
CBOE['daily_change_low'] = (CBOE['Low']-CBOE['Close'].rolling(window=28).mean())/CBOE['Close'].rolling(window=28).mean()*100
#daily range vs. the opening price
CBOE['daily_change_low_high'] = (CBOE['High']-CBOE['Low'])/CBOE['Open']*100

CBOE = CBOE.dropna()
CBOE.tail()


#plot variables    
x = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Date']
y1 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['moving_avg']
y2 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Adj Close']  
#plot
plt.ylabel("Indicator value")
plt.xlabel("Date")
plt.title('Moving average vs daily closing value of volativlity indicator')
plt.plot(x, y2, marker = 'o' , label = 'Adj Close', color='green', alpha=0.5)
plt.plot(x, y1, label = 'moving_avg', color='black', alpha=1)
#plt.plot(x, y_train, marker='o', label = 'train set', color='green')
plt.xticks(rotation='vertical')
plt.legend()
plt.show()

C:\Users\Yoyo\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py:103: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)


#plot variables    
x = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Date']
y1 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['daily_change_close']
y2 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['daily_change_high']
y3 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['daily_change_low']
y4 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['daily_change_low_high']
#y4 = [Y2 if max(abs(Y2), abs(Y3)) == Y2 else Y3 for Y2, Y3 in zip(y2, y3)]
y5 = y2-y3
#plot
plt.ylabel("Daily change (%)")
plt.xlabel("Date")
plt.title('Daily change of volativlity indicator over time')
plt.plot(x, y1, marker='o', label = 'Daily change (closing price vs. moving average)', color='orange', alpha = 0.5)
plt.plot(x, y4, label = 'Daily range (high to low range vs. opening price)', color='blue', alpha = 0.7)
#plt.plot(x, y3, label = 'daily change - low', color='green', alpha = 0.5)
#plt.plot(x, y_train, marker='o', label = 'train set', color='green')
plt.xticks(rotation='vertical')
plt.legend()
plt.show()


import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#from nltk.tokenize import word_tokenize
#from nltk.corpus import opinion_lexicon

import matplotlib.pyplot as plt
%matplotlib inline
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Yoyo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yoyo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True


#Sentiment analysis of the tweets
sid = SentimentIntensityAnalyzer()
scores = pd.DataFrame()
for text in tweets['text']:
    scores = pd.concat([scores, pd.DataFrame.from_dict(sid.polarity_scores("'''"+text+"'''"), orient='index')], axis=1)

scores = scores.T.reset_index(drop=True)
scores.head()


tweets = pd.concat([tweets, scores[['neg', 'neu', 'pos', 'compound']]], axis=1) #concatenating sentiment analysis to presidents database
#tweets.head()


tweets['length'] = tweets['text'].apply(lambda x: len(x.split())) #add word count feature


#importa nltk packages
import nltk.corpus  
from nltk.text import Text 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


#nltk.download()
assert(nltk.download('wordnet'))

def syn(word, lch_threshold=2.5):
    
    '''
    the funciton creates the list of synonyms with financial context from WordNet
    
    The function received the word variable and a threshod to determine word similarity
    and returns the list of synonyms of the word
    ''' 

    lemas = []#np.empty(shape=[0, 2])
    for net1 in wn.synsets(word):
        for net2 in wn.all_synsets():
            try:
                lch = net1.lch_similarity(net2)
            except:
                continue
            if type(lch) == float:
                if lch >= lch_threshold:
                    lemas.append(net2.lemma_names()[0])
    return lemas

#Run function and create the list of financial keywords
financial_list = []
for financial in ['money', 'capital', 'financial_market', 'federal_reserve' 'trade']:
    financial_list.extend(syn(financial))

financial_list

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yoyo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

['standard',
 'appropriation',
 'pork_barrel',
 'fund',
 'mutual_fund',
 'revolving_fund',
 'sinking_fund',
 'savings',
 'pension_fund',
 'war_chest',
 'slush_fund',
 'trust_fund',
 'medium_of_exchange',
 'tender',
 'deposit',
 'money',
 'boodle',
 'shinplaster',
 'subsidization',
 'token_money',
 'currency',
 'budget',
 'petty_cash',
 'property',
 'wealth',
 'money',
 'pile',
 'medium_of_exchange',
 'money',
 'sterling',
 'currency',
 'Eurocurrency',
 'cash',
 'hard_currency',
 'paper_money',
 'coinage',
 'possession',
 'share',
 'tax_base',
 'assets',
 'receivables',
 'crown_jewel',
 'deep_pocket',
 'reserve_assets',
 'sum',
 'resource',
 'intangible',
 'liquid_assets',
 'investment',
 'equity',
 'stock',
 'common_stock',
 'no-par-value_stock',
 'preferred_stock',
 'float',
 'common_stock_equivalent',
 'control_stock',
 'growth_stock',
 'hot_stock',
 'authorized_shares',
 'quarter_stock',
 'security',
 'material_resource',
 'capital',
 'venture_capital',
 'capital',
 'operating_capital',
 'seed_money',
 'funds',
 'hole_card',
 'credit',
 'overage',
 'accounts_receivable',
 'treasury_stock',
 'voting_stock',
 'watered_stock',
 'possession',
 'endowment',
 'patrimony',
 'chantry',
 'share',
 'tax_base',
 'assets',
 'receivables',
 'crown_jewel',
 'deep_pocket',
 'reserve_assets',
 'sum',
 'resource',
 'intangible',
 'liquid_assets',
 'investment',
 'equity',
 'security',
 'material_resource',
 'capital',
 'means',
 'pocketbook',
 'wherewithal',
 'capital',
 'principal',
 'funds',
 'hole_card',
 'credit',
 'overage',
 'accounts_receivable',
 'capital',
 'center',
 'county_seat',
 'county_town',
 'Camelot',
 'see',
 'seat',
 'national_capital',
 'provincial_capital',
 'state_capital',
 'Windhoek',
 'Kabul',
 'Tirana',
 'Algiers',
 'Luanda',
 'George_Town',
 "St._John's",
 'Buenos_Aires',
 'Sofia',
 'Yangon',
 'Bujumbura',
 'Phnom_Penh',
 'Yaounde',
 'Praia',
 'Bangui',
 'Colombo',
 "N'Djamena",
 'Gran_Santiago',
 'Beijing',
 'Taipei',
 'Bogota',
 'Brazzaville',
 'Kinshasa',
 'San_Jose',
 'Yamoussukro',
 'Guatemala_City',
 'Tegucigalpa',
 'San_Salvador',
 'Managua',
 'Panama_City',
 'Mexico_City',
 'Havana',
 'Port-au-Prince',
 'Santo_Domingo',
 'Kingston',
 'Bridgetown',
 'Port_of_Spain',
 'Nicosia',
 'Prague',
 'Bratislava',
 'Porto_Novo',
 'Lome',
 'Copenhagen',
 'Djibouti',
 'Roseau',
 'Malabo',
 'Oslo',
 'Stockholm',
 'Berlin',
 'Quito',
 'Asmara',
 'Addis_Ababa',
 'Suva',
 'Helsinki',
 'Athens',
 'Jerusalem',
 'Rome',
 'Bucharest',
 'Kigali',
 'Belgrade',
 'Ljubljana',
 'Zagreb',
 'Edmonton',
 'Victoria',
 'Winnipeg',
 'Fredericton',
 "Saint_John's",
 'Halifax',
 'Ottawa',
 'Toronto',
 'Charlottetown',
 'Quebec',
 'Regina',
 'Whitehorse',
 'Canberra',
 'Brisbane',
 'Sydney',
 'Melbourne',
 'Hobart',
 'Adelaide',
 'Perth',
 'Darwin',
 'Kolonia',
 'Funafuti',
 'Tarawa',
 'Port_Moresby',
 'Vienna',
 'Nassau',
 'Manama',
 'Dhaka',
 'Bruxelles',
 'Gaborone',
 'La_Paz',
 'Brasilia',
 'London',
 'Belfast',
 'Dublin',
 'Edinburgh',
 'Cardiff',
 'Cairo',
 'New_Delhi',
 'Kathmandu',
 'Lhasa',
 'Jakarta',
 'Teheran',
 'Baghdad',
 'Tokyo',
 'Amman',
 'Nairobi',
 'Kuwait',
 'Paris',
 'Libreville',
 'Banjul',
 'Accra',
 "St._George's",
 'Conakry',
 'Bissau',
 'Georgetown',
 'Amsterdam',
 'Budapest',
 'Reykjavik',
 'Pyongyang',
 'Seoul',
 'Vientiane',
 'Bayrut',
 'Maseru',
 'Monrovia',
 'Tripoli',
 'Vaduz',
 'Luxembourg-Ville',
 'Skopje',
 'Antananarivo',
 'Lilongwe',
 'Putrajaya',
 'Male',
 'Bamako',
 'Valletta',
 'Nouakchott',
 'Port_Louis',
 'Monaco-Ville',
 'Ulan_Bator',
 'Rabat',
 'Maputo',
 'Wellington',
 'Niamey',
 'Abuja',
 'Muscat',
 'Islamabad',
 'Asuncion',
 'Lima',
 'Manila',
 'Warszawa',
 'Lisbon',
 'Doha',
 'Basseterre',
 'Castries',
 'Kingstown',
 'Apia',
 'San_Marino',
 'Sao_Tome',
 'Riyadh',
 'Dakar',
 'Victoria',
 'Freetown',
 'Singapore',
 'Honiara',
 'Mogadishu',
 'Pretoria',
 'Moscow',
 'Minsk',
 'Tallinn',
 'Riga',
 'Vilnius',
 'Kishinev',
 'Kyyiv',
 'Yerevan',
 'Baku',
 'Tbilisi',
 'Astana',
 'Bishkek',
 'Dushanbe',
 'Ashkhabad',
 'Tashkent',
 'Madrid',
 'Khartoum',
 'Paramaribo',
 'Mbabane',
 'Bern',
 'Dimash',
 'Dar_es_Salaam',
 'Bangkok',
 'Tunis',
 'Ankara',
 'Kampala',
 'Abu_Dhabi',
 'Montgomery',
 'Juneau',
 'Phoenix',
 'Little_Rock',
 'Sacramento',
 'Denver',
 'Hartford',
 'Dover',
 'Washington',
 'Tallahassee',
 'Atlanta',
 'Honolulu',
 'Boise',
 'Springfield',
 'Indianapolis',
 'Des_Moines',
 'Topeka',
 'Frankfort',
 'Baton_Rouge',
 'Augusta',
 'Annapolis',
 'Boston',
 'Lansing',
 'Saint_Paul',
 'Jackson',
 'Jefferson_City',
 'Helena',
 'Lincoln',
 'Carson_City',
 'Concord',
 'Trenton',
 'Santa_Fe',
 'Albany',
 'Raleigh',
 'Bismarck',
 'Columbus',
 'Oklahoma_City',
 'Salem',
 'Harrisburg',
 'Providence',
 'Columbia',
 'Pierre',
 'Nashville',
 'Austin',
 'Salt_Lake_City',
 'Montpelier',
 'Richmond',
 'Olympia',
 'Charleston',
 'Madison',
 'Cheyenne',
 'Montevideo',
 'Port_Vila',
 'Caracas',
 'Hanoi',
 'Sana',
 'Lusaka',
 'Harare',
 'written_symbol',
 'character',
 'allograph',
 'check_character',
 'superscript',
 'subscript',
 'ASCII_character',
 'ligature',
 'capital',
 'small_letter',
 'small_capital',
 'type',
 'percent_sign',
 'asterisk',
 'dagger',
 'double_dagger',
 'letter',
 'space',
 'phonetic_symbol',
 'mathematical_symbol',
 'rune',
 'pictograph',
 'ideogram',
 'radical',
 'stenograph',
 'place',
 'center',
 'nerve_center',
 'capital',
 'government',
 'federal_government',
 'United_States_government',
 'Capital',
 'book',
 'authority',
 'curiosa',
 'formulary',
 'trade_book',
 'bestiary',
 'catechism',
 'pop-up_book',
 'storybook',
 'tome',
 'booklet',
 'textbook',
 'workbook',
 'copybook',
 'appointment_book',
 'catalog',
 'phrase_book',
 'playbook',
 'prayer_book',
 'reference_book',
 'review_copy',
 'songbook',
 'publication',
 'yearbook',
 'Das_Kapital',
 'Erewhon',
 'Utopia',
 'capital',
 'masthead',
 'crown',
 'region',
 'top',
 'head',
 'crown']


# convert financial list of words to word roots


#initiate variables
token_words = financial_list
stem_financial=[]

#initiate stemmer and lamentizer
lemmantizer = WordNetLemmatizer() 
stemmer = SnowballStemmer("english") #SnowballStemmer("english")#PorterStemmer()

#iterate over the list of words
for word in token_words:
    if len(word[0].split())>1:
        stem_financial.append(word)   

    #lamentize the list of financial words
    else:  
        if wn.synsets(word)[0].pos() == 'v':
            stem_financial.append(lemmantizer.lemmatize(word, pos="v")) 
        if wn.synsets(word)[0].pos() == 'a':
            stem_financial.append(lemmantizer.lemmatize(word, pos="a")) 
        if wn.synsets(word)[0].pos() == 'n':
            stem_financial.append(lemmantizer.lemmatize(word, pos="n")) 
        if wn.synsets(word)[0].pos() == 'r':
            stem_financial.append(stemmer.stem(word)) 

stem_financial = [word.replace("_", " ") for word in stem_financial] #replace underscores with spaces
stem_financial = list(set(stem_financial)) #remove duplicates
stem_financial

['Nassau',
 'booklet',
 'Quebec',
 'Kabul',
 'stenograph',
 'Manama',
 'written symbol',
 "Saint John's",
 'Oklahoma City',
 'boodle',
 'Tokyo',
 'treasury stock',
 'Lilongwe',
 'Columbia',
 'Dakar',
 'Regina',
 'preferred stock',
 'no-par-value stock',
 'Castries',
 'Conakry',
 'Dimash',
 'Gran Santiago',
 'wealth',
 'Brazzaville',
 'Abuja',
 'Riyadh',
 'Bridgetown',
 'Amman',
 'character',
 'Lisbon',
 'mathematical symbol',
 'Nashville',
 'pension fund',
 'Canberra',
 'patrimony',
 'Vilnius',
 'asset',
 'Kingstown',
 'asterisk',
 'head',
 'common stock',
 'reference book',
 'Baton Rouge',
 'Tirana',
 'Victoria',
 'tender',
 'Das Kapital',
 'hot stock',
 'Trenton',
 'Harrisburg',
 'Maseru',
 'Sydney',
 'Port of Spain',
 'Bayrut',
 'Sacramento',
 'Indianapolis',
 'Accra',
 'Ljubljana',
 'Astana',
 'Kingston',
 'Charlottetown',
 'deposit',
 'space',
 'Belgrade',
 'Phoenix',
 'Kathmandu',
 'share',
 'sterling',
 'Abu Dhabi',
 'United States government',
 'Djibouti',
 'deep pocket',
 'Amsterdam',
 'Warszawa',
 'San Jose',
 'Utopia',
 'venture capital',
 'phrase book',
 'nerve center',
 'Cardiff',
 'publication',
 'Zagreb',
 'yearbook',
 'Kuwait',
 'control stock',
 'San Salvador',
 'Edinburgh',
 'Charleston',
 'authorized shares',
 'chantry',
 'authority',
 'Singapore',
 'Boise',
 'slush fund',
 'appropriation',
 'investment',
 'Bamako',
 'check character',
 'Islamabad',
 'Montevideo',
 'Bismarck',
 'Dover',
 'Beijing',
 'Guatemala City',
 'Bishkek',
 'Honolulu',
 'Roseau',
 'seed money',
 'operating capital',
 'mutual fund',
 'phonetic symbol',
 'Kishinev',
 'resource',
 'Antananarivo',
 'Fredericton',
 'ideogram',
 'hole card',
 'receivables',
 'Vientiane',
 'capital',
 'Prague',
 'Oslo',
 'Tallinn',
 'pork barrel',
 'Jerusalem',
 'Ashkhabad',
 'Stockholm',
 'Baghdad',
 'radical',
 'Nicosia',
 'La Paz',
 'Georgetown',
 'Hobart',
 'Monrovia',
 'Paramaribo',
 'voting stock',
 'Mbabane',
 'Helsinki',
 'Mogadishu',
 'Tbilisi',
 'Kyyiv',
 'Minsk',
 'subscript',
 'letter',
 'endowment',
 'Malabo',
 'fund',
 'Colombo',
 'ASCII character',
 'Pretoria',
 'top',
 'Bogota',
 'Dar es Salaam',
 'Bujumbura',
 'Halifax',
 'Augusta',
 'Asmara',
 'pop-up book',
 'Columbus',
 'federal government',
 'Pyongyang',
 'Camelot',
 'double dagger',
 'pile',
 'New Delhi',
 'Darwin',
 'Windhoek',
 'Niamey',
 'Jakarta',
 'Toronto',
 'Concord',
 'Suva',
 'Eurocurrency',
 'Lhasa',
 'Brisbane',
 'principal',
 'currency',
 'money',
 'seat',
 'watered stock',
 'Kinshasa',
 'Honiara',
 'paper money',
 'state capital',
 'Managua',
 'London',
 'center',
 'Havana',
 'crown jewel',
 'Copenhagen',
 'Pierre',
 'copybook',
 'Juneau',
 'Dushanbe',
 'trade book',
 'dagger',
 'Budapest',
 'Tripoli',
 'provincial capital',
 'Port-au-Prince',
 'token money',
 'mean',
 'Riga',
 'Hartford',
 'Lusaka',
 'Port Louis',
 'Ulan Bator',
 'government',
 'small capital',
 'Berlin',
 'Dublin',
 'book',
 'masthead',
 'Harare',
 'Luxembourg-Ville',
 'Tegucigalpa',
 'crown',
 'textbook',
 'place',
 'Madrid',
 'intangible',
 'Maputo',
 'review copy',
 'Dhaka',
 'Montgomery',
 'allograph',
 'Porto Novo',
 'Kolonia',
 'Brasilia',
 'sum',
 'Kigali',
 'Helena',
 'Lima',
 'common stock equivalent',
 'Valletta',
 'sinking fund',
 'songbook',
 'Bangui',
 'Ankara',
 'Yerevan',
 'Adelaide',
 'Bratislava',
 'credit',
 'Teheran',
 'Atlanta',
 'Sao Tome',
 'workbook',
 'Port Moresby',
 'shinplaster',
 'Libreville',
 'Perth',
 'Putrajaya',
 'Reykjavik',
 'Des Moines',
 'Melbourne',
 'Male',
 'Frankfort',
 'Lincoln',
 "N'Djamena",
 'ligature',
 'small letter',
 'petty cash',
 'Salt Lake City',
 'Annapolis',
 'stock',
 'storybook',
 'tax base',
 'Bern',
 'Yamoussukro',
 'Cheyenne',
 'possession',
 'Nouakchott',
 'Capital',
 'Praia',
 'Yaounde',
 'subsidization',
 'Buenos Aires',
 'Bissau',
 'Banjul',
 'see',
 'Rome',
 'pocketbook',
 'overage',
 'appointment book',
 'Moscow',
 "St. George's",
 'Hanoi',
 'security',
 'Madison',
 'Edmonton',
 'Muscat',
 'Port Vila',
 'hard currency',
 'Khartoum',
 'Saint Paul',
 'equity',
 'Freetown',
 'Paris',
 'growth stock',
 'medium of exchange',
 'county town',
 'Erewhon',
 'Addis Ababa',
 'accounts receivable',
 'Tunis',
 'trust fund',
 'Seoul',
 'prayer book',
 'Bucharest',
 'region',
 'playbook',
 'reserve assets',
 'superscript',
 'Panama City',
 "St. John's",
 'Luanda',
 'Boston',
 'Raleigh',
 'tome',
 'Tallahassee',
 'Taipei',
 'national capital',
 'Richmond',
 'Little Rock',
 'Springfield',
 'Baku',
 'Vaduz',
 'material resource',
 'Monaco-Ville',
 'Bangkok',
 'standard',
 'Topeka',
 'cash',
 'Providence',
 'percent sign',
 'property',
 'Austin',
 'Phnom Penh',
 'Belfast',
 'saving',
 'Sana',
 'Caracas',
 'Olympia',
 'quarter stock',
 'Sofia',
 'Athens',
 'Whitehorse',
 'bestiary',
 'Skopje',
 'Salem',
 'Santo Domingo',
 'rune',
 'war chest',
 'Manila',
 'Wellington',
 'catalog',
 'Rabat',
 'revolving fund',
 'Winnipeg',
 'Albany',
 'Asuncion',
 'curiosa',
 'Nairobi',
 'budget',
 'Gaborone',
 'formulary',
 'Doha',
 'George Town',
 'county seat',
 'Jefferson City',
 'Lansing',
 'Kampala',
 'Montpelier',
 'Tashkent',
 'float',
 'Lome',
 'type',
 'Vienna',
 'Jackson',
 'catechism',
 'San Marino',
 'Quito',
 'coinage',
 'Yangon',
 'Ottawa',
 'pictograph',
 'Apia',
 'Algiers',
 'Mexico City',
 'Basseterre',
 'Funafuti',
 'liquid assets',
 'Denver',
 'wherewithal',
 'Carson City',
 'Washington',
 'Tarawa',
 'Santa Fe',
 'Bruxelles',
 'Cairo']


def is_financial_tweet(text, financial_words):
    
    #financial_words = ['finac', 'federal reserve', 'interest', 'negative', 'trade', 'market']
    count_financial = 0
    #count the number of time financial words appear in Trump's tweets
    for word in financial_words:
        count_financial += text.lower().count(word)
    return count_financial
#    return 1 if any(f in text.lower() for f in financial_words) else 0

def num_upper(text):
    return sum(map(str.isupper, text.split()))

tweets['num_upper'] = tweets['text'].apply(num_upper)
tweets['is_financial'] = tweets['text'].apply(lambda x: is_financial_tweet(x, stem_financial))
tweets['country_mentioned'] = tweets['text'].apply(lambda x: is_financial_tweet(x, ['china', 'iran', 'russia', 'israel', 'turkey', 'britain', 'france', 'ukraine', 'japan', 'brasil']))
tweets.head()


nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import RegexpTokenizer


def tag_count (tweet, NN, VB, RB, JJ):
    '''
    Returns the number of word types in a tweet
    
    tweet: is a string
    tag_type: is a string of two initial letters of tag type (NN, VB, etc.)
    '''
    
    tokenizer = RegexpTokenizer(r'\w+')
    string = tokenizer.tokenize(tweet) #remove punctuation
    
    string = pd.DataFrame(string)
    string = string[string[0].apply(lambda x: len(x)>3)][0].tolist() #remove words short words

    #count nouns
    tokens = nltk.word_tokenize(' '.join(string))
    #print('\n', list(pd.DataFrame(nltk.pos_tag(tokens))))
    tags = pd.DataFrame(nltk.pos_tag(tokens))
    if list(tags) != []:
        tags = tags.groupby(1).count() #group by word tag
        tags = tags.reset_index()

        tags[1] = tags[1].apply(lambda x: x[0:2]) #select word tag type
        group_count = tags.groupby(1).sum().reset_index() #group by tag type

        #return number of nouns in a string
        #print('\n', group_count[group_count[1] == NN][0].values, group_count[group_count[1] == VB][0].values, group_count[group_count[1] == RB][0].values)
        nouns = [group_count[group_count[1] == NN][0].values 
                 if len(group_count[group_count[1] == NN][0].values)>0 else [0]][0][0]
        verbs = [group_count[group_count[1] == VB][0].values 
                 if len(group_count[group_count[1] == VB][0].values)>0 else [0]][0][0]
        adverbs = [group_count[group_count[1] == RB][0].values 
                 if len(group_count[group_count[1] == RB][0].values)>0 else [0]][0][0]
        adjectives = [group_count[group_count[1] == JJ][0].values 
                 if len(group_count[group_count[1] == JJ][0].values)>0 else [0]][0][0]
        #print('\n', nouns, verbs, adverbs)
        return nouns, verbs, adverbs, adjectives
    else:
        return 0, 0, 0, 0

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Yoyo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


tweets=tweets.join(pd.DataFrame(tweets['text'].apply(lambda x: tag_count(x, 'NN', 'VB', 'RB', 'JJ')).tolist(),
                                                    columns = ['nouns', 'verbs', 'adverbs', 'adjectives']))
#tweets.to_csv('tweets_final.csv')
tweets.head()


#Create df with average values

bars_df = tweets[tweets['Date']>datetime.date(2016, 11, 1)].copy()
bars_df['year-month']= bars_df['Date'].map(lambda x: str(x.year) +'-'+ str(x.month))

bars_df = bars_df.loc[:, ['year-month', 'neg', 'pos', 'neu']].groupby(['year-month']).mean().reset_index().T
bars_df.columns = bars_df.iloc[0]
bars_df = bars_df.drop(['year-month'], axis =0)
bars_df


#Create df with STDEV values

bars_df_e = tweets[tweets['Date']>datetime.date(2016, 11, 1)].copy()
bars_df_e['year-month']= bars_df_e['Date'].map(lambda x: str(x.year) +'-'+ str(x.month))

bars_df_e = bars_df_e.loc[:, ['year-month', 'neg', 'pos', 'neu']].groupby(['year-month']).std().reset_index().T
bars_df_e.columns = bars_df_e.iloc[0]
bars_df_e = bars_df_e.drop(['year-month'], axis =0)

bars_df_e


# your code here

plt.figure(figsize=(30, 15)) #figure size
N=len(list(bars_df))

#coefficient variables
neg_bar = bars_df.loc[['neg'], :].values.flatten()
pos_bar = bars_df.loc[['pos'], :].values.flatten()
neg_bar_e = bars_df_e.loc[['neg'], :].values.flatten()
pos_bar_e = bars_df_e.loc[['pos'], :].values.flatten()

#coefficient magnitudes
bars = np.add(neg_bar, pos_bar).tolist()

ind = np.arange(N)    # the x locations for the groups
width = 0.75       # the width of the bars

p1 = plt.bar(ind, neg_bar, width, color='r',edgecolor='white', yerr=[neg_bar_e/1000, neg_bar_e/2])
p2 = plt.bar(ind, pos_bar, width, bottom = neg_bar, color='b', edgecolor='white', yerr=[pos_bar_e/1000, pos_bar_e/2])


plt.ylabel('Predictor mean value\n *error bars represent STDEV', fontsize=25)
plt.title('The ratio of pisitive and negative components of Trump\'s tweets', fontsize=27)
plt.xticks(ind, list(bars_df), rotation='vertical', fontsize=25)
plt.yticks(fontsize=22)
plt.legend((p1[0], p2[0]), ('negative', 'positive'), loc="upper left", fontsize=25)

plt.show()


#Create df with average values
bars_df = tweets[tweets['Date']>datetime.date(2016, 11, 1)].copy()
bars_df['year-month']= bars_df['Date'].map(lambda x: str(x.year) +'-'+ str(x.month))
bars_df['descriptive']=bars_df['adverbs']+bars_df['adjectives']
bars_df = bars_df.loc[:, ['year-month', 'nouns', 'verbs', 'descriptive']].groupby(['year-month']).mean().reset_index().T
bars_df.columns = bars_df.iloc[0]
bars_df = bars_df.drop(['year-month'], axis =0)
bars_df


#Create df with STDEV values

bars_df_e = tweets[tweets['Date']>datetime.date(2016, 11, 1)].copy()
bars_df_e['year-month']= bars_df_e['Date'].map(lambda x: str(x.year) +'-'+ str(x.month))
bars_df_e['descriptive']=bars_df_e['adverbs']+bars_df_e['adjectives']
bars_df_e = bars_df_e.loc[:, ['year-month', 'nouns', 'verbs', 'descriptive']].groupby(['year-month']).std().reset_index().T
bars_df_e.columns = bars_df_e.iloc[0]
bars_df_e = bars_df_e.drop(['year-month'], axis =0)

bars_df_e


# your code here

plt.figure(figsize=(30, 15)) #figure size
N=len(list(bars_df))

#coefficient variables
nouns_bar = bars_df.loc[['nouns'], :].values.flatten()
verbs_bar = bars_df.loc[['verbs'], :].values.flatten()
adverbs_bar = bars_df.loc[['descriptive'], :].values.flatten()

nouns_bar_e = bars_df_e.loc[['nouns'], :].values.flatten()
verbs_bar_e = bars_df_e.loc[['verbs'], :].values.flatten()
adverbs_bar_e = bars_df_e.loc[['descriptive'], :].values.flatten()


#coefficient magnitudes
bars = np.add(nouns_bar, verbs_bar).tolist()

ind = np.arange(N)    # the x locations for the groups
width = 0.75       # the width of the bars

p1 = plt.bar(ind, nouns_bar, width, color='r',edgecolor='white', yerr=[nouns_bar_e/1000, nouns_bar_e/2])
p2 = plt.bar(ind, verbs_bar, width, bottom = nouns_bar, color='b', edgecolor='white', yerr=[verbs_bar_e/1000, verbs_bar_e/2])
p3 = plt.bar(ind, adverbs_bar, width, bottom = bars, color='c', edgecolor='white', yerr=[adverbs_bar_e/1000, adverbs_bar_e/2])

plt.ylabel('Predictor mean value\n *error bars represent STDEV', fontsize=25)
plt.title('The ratio of nouns verbs and descriptive words in Trump\'s tweets', fontsize=27)
plt.xticks(ind, list(bars_df), rotation='vertical', fontsize=22)
plt.yticks(fontsize=22)
plt.legend((p1[0], p2[0], p3[0]), ('nouns', 'verbs', 'descriptive'), loc="upper left", fontsize=25)

plt.show()


tweets['favorite_count'] = pd.to_numeric(tweets['favorite_count'])
tweets['retweet_count'] = pd.to_numeric(tweets['retweet_count'])


tweets_count = pd.DataFrame({'tweet_count':tweets['Date'].value_counts(dropna=False)}).rename_axis('Date').reset_index()
tweets_count.head()


tweets_by_date = tweets.drop(['source','text','is_retweet', 'id_str'], axis = 1).groupby(['Date']).mean().reset_index()
tweets_by_date = tweets_by_date.merge(tweets_count, on='Date', how='left')
tweets_by_date.tail(20)


tweets_by_date = tweets_by_date[tweets_by_date['Date']>datetime.date(2016, 11, 1)]
tweets_by_date = tweets_by_date[tweets_by_date['Date']<datetime.date(2019, 9, 26)]
CBOE = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]
CBOE = CBOE[CBOE['Date']<datetime.date(2019, 9, 26)]


datamerge = tweets_by_date.merge(CBOE, on='Date', how='outer')

datamerge = datamerge.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1)
datamerge = datamerge[datamerge['Date']<datetime.date(2019, 9, 26)]

#Impute Staurday and Sunday values
datamerge['daily_change_low'] = datamerge['daily_change_low'].fillna(datamerge['daily_change_low'].rolling(2, min_periods=1).mean().shift(-2))
datamerge['daily_change_high'] = datamerge['daily_change_high'].fillna(datamerge['daily_change_high'].rolling(2, min_periods=1).mean().shift(-2))
datamerge['daily_change_close'] = datamerge['daily_change_close'].fillna(datamerge['daily_change_close'].rolling(2, min_periods=1).mean().shift(-2))
datamerge['daily_change_low_high'] = datamerge['daily_change_low_high'].fillna(datamerge['daily_change_low_high'].rolling(2, min_periods=1).mean().shift(-2))
datamerge['Adj Close'] = datamerge['Adj Close'].fillna(datamerge['Adj Close'].rolling(2, min_periods=1).mean().shift(-2))
datamerge['moving_avg'] = datamerge['moving_avg'].fillna(datamerge['moving_avg'].rolling(2, min_periods=1).mean().shift(-2))

#Define days with missing tweets as 1
datamerge['no_tweet'] = datamerge['length'].map(lambda x: 1 if np.isnan(x) else 0)

#Impute of missing feature values as 0
#datamerge[datamerge['length'].isna().any(axis=1)] = datamerge[datamerge['length'].isnull()].fillna(0)
datamerge[datamerge['length'].isna()] = datamerge[datamerge['length'].isna()].fillna(0)

datamerge[datamerge['retweet_count'].isna()] = datamerge[datamerge['retweet_count'].isna()].fillna(0)
#print('Numpber of NaN tweet rows after imputation:', len(datamerge[datamerge['daily_change_low'].isna()]))
#print('Numpber of NaN tweet rows after imputation:', len(datamerge[datamerge.isna().any(axis=1)]))

datamerge = datamerge.sort_values(by='Date')

#drop NaN values that are not related to imputed values
datamerge = datamerge.dropna(axis=0, subset=['daily_change_close'])
datamerge = datamerge.dropna(axis=0, subset=['daily_change_low'])
datamerge = datamerge.dropna(axis=0, subset=['daily_change_high'])

datamerge.tail()


datamerge['daily_change_low_high'].median()

0.6387194469741241


type(datamerge['daily_change_low_high'])

pandas.core.series.Series


from sklearn.preprocessing import power_transform


#Set threshold for daily change variable

#datamerge['change'] = (datamerge['daily_change_low_high'].map(lambda x: 1 if (abs(x) > datamerge['daily_change_low_high'].abs().median()) else 0))
datamerge['change'] = (datamerge['daily_change_low_high'].map(lambda x: 1 if (x > datamerge['daily_change_low_high'].median()) else 0))
#temp_tr = pd.Series(power_transform(datamerge['daily_change_low_high'].values.reshape(-1,1), method='yeo-johnson', standardize=False).reshape(1,-1)[0])
#datamerge['change'] = temp_tr.map(lambda x: 1 if (abs(x) > temp_tr.abs().mean()) else 0)
#datamerge['change'] = datamerge['daily_change'].map(lambda x: 1 if x > 1.7 else 0)

datamerge.head()


### Figures of Trump's tweets over time


# Standardize the data attributes for the Iris dataset.
from sklearn.datasets import load_iris
from sklearn import preprocessing

# standardize the data attributes
preprocessing.scale(y2)

array([-1.91014032, -1.92246276, -1.85169447, -1.94010655, -1.89815383,
       -1.91714024, -1.89575653, -1.85629709, -1.88305131, -1.90328437,
       -1.88971529, -1.86531154, -1.95621691, -1.95938092, -2.00938824,
       -1.9182909 , -1.85634508, -1.86473563, -1.82647484, -1.86521556,
       -1.86521556, -1.83443345, -1.85783169, -1.83769462, -1.87005814,
       -1.85087976, -1.89019522, -1.85365983, -1.88295533, -1.81165912,
       -1.72391776, -1.73202152, -1.74156272, -1.8076804 , -1.8174124 ,
       -1.8141524 , -1.80791919, -1.72847357, -1.75177465, -1.74928137,
       -1.74165871, -1.6783691 , -1.63890966, -1.58065578, -1.53592182,
       -1.47996925, -1.48970242, -1.47081199, -1.40263297, -1.41490625,
       -1.41015964, -1.39323457, -1.38168002, -1.41097552, -1.25601383,
       -1.32333015, -1.3175757 , -1.35502178, -1.38820118, -1.41413953,
       -1.40507709, -1.36801495, -1.36384426, -1.40229702, -1.30731578,
       -1.32591942, -1.34092595, -1.36384426, -1.50504489, -1.48380516,
       -1.49574366, -1.50523687, -1.51669544, -1.43528328, -1.42301   ,
       -1.3897826 , -1.41538617, -1.43398864, -1.42766062, -1.46222948,
       -1.44046182, -1.44981105, -1.44204441, -1.45824959, -1.50068223,
       -1.5772998 , -1.4811199 , -1.51381938, -1.53309375, -1.44837243,
       -1.48265451, -1.36058426, -1.29125341, -1.29681589, -1.29048669,
       -1.31239833, -1.29259604, -1.27898013, -1.29355589, -1.28689075,
       -1.24004745, -1.23961668, -1.2514112 , -1.23841803, -1.26330171,
       -1.28027477, -1.22551968, -1.23343146, -1.44266715, -1.40100238,
       -1.32424085, -1.26531506, -1.24421931, -1.2155957 , -1.16438857,
       -1.16079262, -1.17474565, -1.18006699, -1.09251761, -1.04931825,
       -1.06355806, -1.09601758, -1.07775105, -1.07463386, -1.08431904,
       -1.09573079, -1.04318102, -1.05483274, -1.08101105, -1.07770306,
       -0.98032452, -1.05909942, -1.06590737, -1.07122988, -1.05301018,
       -1.04931825, -1.14372475, -1.04155161, -1.14219015, -1.12440238,
       -1.09755217, -1.08062711, -1.18989615, -1.11591585, -1.10512801,
       -1.11423728, -1.02927716, -1.00731754, -0.95246763, -0.95309154,
       -0.94604245, -0.88265802, -0.88448058, -0.88884324, -0.90145364,
       -0.86683679, -0.86347964, -0.87503536, -0.89095258, -0.89958309,
       -0.87057554, -0.86472628, -0.8906658 , -0.86827423, -0.84871308,
       -0.87743266, -0.88174733, -1.05344212, -1.03853041, -0.9209668 ,
       -0.92686407, -0.91008298, -1.09275758, -1.11414129, -1.1006202 ,
       -0.98487915, -1.02548925, -1.04979818, -1.03023585, -1.02453056,
       -1.01465341, -0.96052223, -0.89311109, -0.86961686, -0.95927558,
       -0.92240542, -0.92451476, -0.94211172, -0.81419105, -0.77406088,
       -0.76499844, -0.77818358, -0.75608114, -0.73862815, -0.72530021,
       -0.71767638, -0.75430658, -0.74653994, -0.77319818, -0.77233431,
       -0.72342966, -0.70894988, -0.6643599 , -0.61756459, -0.59138628,
       -0.57623577, -0.50752883, -0.52066598, -0.54272159, -0.51438594,
       -0.49233033, -0.51299532, -0.50225547, -0.48082377, -0.47257603,
       -0.46346677, -0.45943888, -0.39658238, -0.44563101, -0.4257339 ,
       -0.48317308, -0.46759064, -0.36848553, -0.40799296, -0.39634241,
       -0.3766841 , -0.37433479, -0.33602601, -0.32025277, -0.32260208,
       -0.30467033, -0.35146448, -0.36258827, -0.3504098 , -0.37903341,
       -0.44735641, -0.34657506, -0.37912939, -0.36335615, -0.28237476,
       -0.29172399, -0.26612159, -0.27091619, -0.14807806, -0.15272868,
       -0.04959686, -0.07529641, -0.08862553, -0.13594759, -0.1373862 ,
       -0.10042005, -0.03080241,  0.00990366,  0.02965796,  0.02361673,
       -0.02835712,  0.08575451,  0.15455627,  0.11289151,  0.10224764,
        0.12775522,  0.12185797,  0.10824089,  0.118406  ,  0.14199504,
        0.07520663,  0.18164645,  0.26435324,  0.31675785,  0.4086219 ,
        0.43048554,  0.44765057,  0.43297882,  0.52565876,  0.61522149,
        0.56813823,  0.69346964,  0.67174997,  0.73057977,  0.8392729 ,
        0.86880719,  0.86118453,  0.86938311,  1.03057801,  0.93785009,
        0.78873767,  0.79535482,  0.78658033,  0.49962325, -0.04307686,
        0.17843327,  0.1138022 , -0.36882148, -0.18398954, -0.00922673,
        0.02404749,  0.19516637,  0.3513267 ,  0.35621729,  0.27969571,
        0.20811271,  0.22072194,  0.42852018,  0.58338589,  0.4140404 ,
        0.26804517,  0.09467181,  0.1597828 ,  0.3021341 ,  0.33656014,
        0.33023095,  0.38858082,  0.61680408,  0.59978303,  0.51487091,
        0.43897207,  0.42866416,  0.45110254,  0.26368134,  0.28295571,
        0.25893474, -0.0682485 , -0.33401266,  0.00299973, -0.21721577,
       -0.25375116, -0.08176841, -0.36460279, -0.20844245, -0.0634539 ,
        0.02356874, -0.25629243, -0.21462767, -0.00505487, -0.07544039,
        0.02908205, -0.00778811,  0.09548768,  0.23237248,  0.24316033,
        0.16879608,  0.05856836,  0.05928825, -0.1120226 , -0.0888175 ,
        0.04322588,  0.05746569, -0.04734354, -0.01498001, -0.10670125,
       -0.13518088,  0.02634881,  0.07050685,  0.06710288,  0.19113965,
        0.3123472 ,  0.3346416 ,  0.34619616,  0.25663342,  0.30942198,
        0.29825019,  0.2639213 ,  0.36000521,  0.31891518,  0.36134783,
        0.33483358,  0.30400464,  0.15311883,  0.31685384,  0.22700314,
        0.36772502,  0.42645883,  0.43571208,  0.54862506,  0.53913185,
        0.58065264,  0.59489245,  0.61814671,  0.56435031,  0.59724175,
        0.58367267,  0.55533703,  0.50182975,  0.5245081 ,  0.4403147 ,
        0.46486243,  0.28357962,  0.31229921,  0.19996096,  0.27993568,
        0.28981283,  0.32979901,  0.26511995,  0.37726622,  0.48854862,
        0.60529635,  0.6516609 ,  0.55663166,  0.67299662,  0.68747639,
        0.67366735,  0.72698382,  0.75608735,  0.70272288,  0.68996967,
        0.71466138,  0.77900449,  0.90208258,  0.8607046 ,  0.77142982,
        0.69366161,  0.75929936,  0.74525151,  0.81170397,  0.87465763,
        0.92284239,  0.96143913,  0.95784318,  0.93809005,  0.8407595 ,
        0.78634037,  0.87278708,  0.76927249,  0.87628704,  0.92154776,
        0.95472716,  0.98306281,  0.97759748,  0.95439121,  1.03930333,
        1.14502442,  1.14876435,  1.22797118,  1.16607219,  1.16794274,
        1.14492844,  1.10599692,  1.05541369,  1.02487155,  1.05100187,
        1.10259177,  1.10753035,  1.18069594,  1.18453185,  1.10695561,
        1.18131985,  1.19877166,  1.3080887 ,  1.30291016,  1.25352675,
        1.23525906,  1.18927846,  1.22777921,  1.22768322,  1.27855441,
        1.27299193,  1.28296506,  1.16837467,  1.09146915,  1.08600266,
        1.06639351,  0.61253624,  0.33775879,  0.52359624,  0.44525327,
        0.72875721,  0.72535324,  0.53150802,  0.52671342,  0.46965702,
        0.39682738, -0.0087468 ,  0.22844175,  0.00367045, -0.07994703,
        0.11845282,  0.25802404,  0.39529395,  0.31229921,  0.3854168 ,
        0.46759567,  0.74779162,  0.71394266,  0.59014585,  0.32744971,
        0.30807935,  0.20931136,  0.34653211,  0.37563564,  0.15728952,
       -0.076879  , -0.03833026, -0.12161179,  0.07443875,  0.11624749,
        0.4116911 ,  0.38297151,  0.49022603,  0.63502378,  0.20202348,
        0.18231717, -0.11911851, -0.09687209, -0.10137874, -0.03286376,
       -0.03540504, -0.2779641 , -0.53692032, -0.53586565, -0.7238136 ,
       -0.91339214, -1.15695671, -1.47109877, -0.91204951, -0.81073907,
       -0.8255548 , -0.72434035, -0.70909386, -1.00703075, -0.60404467,
       -0.51894058, -0.40041829, -0.34983506, -0.2938345 , -0.29565588,
       -0.36110166, -0.22833956, -0.20053066, -0.10531063,  0.0613016 ,
       -0.11998238, -0.09217348, -0.07476849,  0.03277399, -0.06748061,
       -0.08594027,  0.11087815,  0.22139384,  0.23304438,  0.3209777 ,
        0.38249158,  0.35329324,  0.23074307,  0.23951639,  0.24872281,
        0.41619774,  0.45599312,  0.42099233,  0.5642075 ,  0.5841526 ,
        0.60783763,  0.56075436,  0.64605042,  0.66254472,  0.65194768,
        0.6446598 ,  0.60683095,  0.69888697,  0.64672232,  0.63157064,
        0.54430922,  0.43633481,  0.40823913,  0.60112566,  0.64053711,
        0.73355181,  0.72185327,  0.78897763,  0.83912892,  0.83735553,
        0.79736817,  0.94432209,  0.68459917,  0.67333256,  0.76970326,
        0.70694273,  0.75522348,  0.84612884,  1.00334385,  1.00358382,
        1.03311811,  1.0618377 ,  1.12584603,  1.14037381,  1.05613241,
        1.10412637,  1.10465429,  1.19618239,  1.18740908,  1.19450499,
        1.16281219,  1.18477181,  1.19886765,  1.32213654,  1.29130761,
        1.28612907,  1.35186281,  1.36696649,  1.38039159,  1.27443054,
        1.24465628,  1.37947973,  1.31633527,  1.08418127,  1.06198168,
        1.02026892,  1.07147489,  0.73810761,  0.84617684,  0.92552765,
        1.04711913,  0.96661767,  0.87408172,  0.98977594,  0.95098724,
        0.78782697,  0.80614266,  0.69265376,  0.59978303,  0.6277839 ,
        0.45134251,  0.41485511,  0.69687362,  0.80657342,  0.88971215,
        1.03283133,  1.09703046,  1.09218787,  1.06399621,  1.12057151,
        1.09822911,  1.1111263 ,  1.24575894,  1.28751969,  1.42042577,
        1.40259001,  1.37809028,  1.24398438,  1.22672453,  1.28013583,
        1.36087726,  1.46909163,  1.5107084 ,  1.62007343,  1.59413391,
        1.52480423,  1.54244802,  1.60688829,  1.63968258,  1.70613621,
        1.70867748,  1.65948487,  1.56541433,  1.61666945,  1.52796942,
        1.56833954,  1.66634082,  1.73389711,  1.65771031,  1.7641033 ,
        1.74065707,  1.70330698,  1.54604397,  1.41745373,  1.31432191,
        0.89570539,  1.07324945,  1.08384532,  1.34328146,  1.25007361,
        1.0777081 ,  1.28684896,  0.87585628,  0.90941846,  1.10637969,
        1.27404659,  1.16310015,  1.27778652,  1.27069061,  0.90706915,
        1.05699511,  1.01278907,  1.10283173,  1.27850641,  1.28751969,
        1.19071707,  1.34179485,  1.5250442 ,  1.53803737,  1.53669474,
        1.54129737,  1.64457317,  1.68599913,  1.67554607,  1.63033335,
        1.66744348,  1.67238206,  1.67267002,  1.60209369,  1.60070307,
        1.47997546,  1.56757283])


#Generate the plots of Trump's tweet frequency over time

datamerge['DateEST'] = datamerge.Date
plt.figure(figsize=(16,10))
sns.distplot(datamerge['hour'])
plt.ylabel("Frequency", fontsize=24)
plt.xlabel("Date (EST Hour)", fontsize=24)
plt.title("Tweets by Hour", fontsize=24);
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)


f = tweets.groupby(['Date']).count().reset_index()
d = (f.Date >= datetime.date(year=2007,month=1,day=1))
#display(f)
plt.figure(figsize=(18,10))
plt.scatter(f['Date'], f['is_retweet'])
sns.scatterplot(x='Date', y='is_retweet', data=f[d])
plt.ylabel("Tweets per Day", fontsize=24)
plt.xlabel("Date", fontsize=24)
plt.title("Tweets per Day Over Time", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

datamerge = datamerge.drop(['DateEST'], axis=1)


def violin_plot(df, dist_column, masks):
    '''
    
    Plots the effect of indicator features on stock market volatiltiy indicator
    
    Receives fianl dataset containing the predictors, response variable column, and filters by indicator variables
    
    '''
    to_plot_violins = [df[m][dist_column].values.reshape(-1,1) for (m,_,_,_) in masks]
    to_plot_violins_labels = [l for (_,l,_, _) in masks]
    positions = [p for (_,_,p,_) in masks]
    plt.figure(figsize=(18,10))
    plt.title("Market Volatiltiy Distribution by Indicator Features", fontsize=18)
    plt.ylabel("% Volatility Change (High - Low / Open)", fontsize=18)
    plt.xticks(list(positions), list(to_plot_violins_labels))

    violin_parts = plt.violinplot(dataset=to_plot_violins, positions=positions, showmedians=True, showextrema=True)

    for i, p in enumerate(violin_parts['bodies']):
        (_, _, _, c) = masks[i]
        p.set_facecolor(c)
        p.set_edgecolor(c)


    plt.xticks(fontsize=15, rotation=90)
    plt.show()

violin_plot(
    datamerge,
    'daily_change_low_high',
    masks=[
        ((datamerge.is_financial == 0), "No financial keywords", 0.0, "green"),
        ((datamerge.is_financial != 0), "Some financial keywords", 0.06, "red"),
        ((datamerge.neg > datamerge.pos), "Sentiment more negative", 0.5, "green"),
        ((datamerge.neg <= datamerge.pos), "Sentiment more positive", 0.56, "red"),
        ((datamerge.neu > datamerge.pos+datamerge.neg), "Sentiment neutral", 1.0, "green"),
        ((datamerge.neu <= datamerge.pos+datamerge.neg), "Sentiment less neutral", 1.06, "red"),
        ((datamerge.country_mentioned > 0), "Countries mentioned", 1.5, "green"),
        ((datamerge.country_mentioned == 0), "No countries mentioned", 1.56, "red"),
        ((datamerge.no_tweet == 0), "Day with tweets", 2.0, "green"),
        ((datamerge.no_tweet == 1), "Day without tweet", 2.06, "red"),
        ((datamerge.num_upper > 0), "Upper case words present", 2.5, "green"),
        ((datamerge.num_upper == 0), "No upper case words", 2.56, "red"),
        (((datamerge.hour >= 9.5) & (datamerge.hour <= 16.5)), "Tweeted during trading hours", 3.0, "green"),
        (((datamerge.hour < 9.5) | (datamerge.hour > 16.5)), "Tweeted during afterhours", 3.06, "red"),
        ((datamerge.nouns > datamerge.nouns.mean()), "Nouns above average", 3.50, "green"),
        ((datamerge.nouns < datamerge.nouns.mean()), "Nouns below average", 3.56, "red"),
        ((datamerge.no_tweet != -1), "All daily range / \nopen price data", 4.0, "Green"),
    ]
)


datamerge['res_length'] = datamerge['length']-(datamerge['nouns']+datamerge['verbs']+datamerge['adverbs']+datamerge['adjectives'])
datamerge = datamerge.drop(['length'], axis=1) #variable removed as it is collinear with nouns, verbs, adverbes and adjectives.


sns.set(font_scale=1.5)
df = datamerge[datamerge['Date']>datetime.date(2016, 11, 1)].drop(['Date'], axis=1)
sns.pairplot(data = df,
                  y_vars=['Adj Close', 'moving_avg', 'daily_change_close','daily_change_high', 'daily_change_low', 'daily_change_low_high', 'change'],
                  x_vars=list(df.drop(['Adj Close', 'moving_avg', 'daily_change_close','daily_change_high', 'daily_change_low', 'daily_change_low_high', 'change'], axis =1)), kind="reg")

<seaborn.axisgrid.PairGrid at 0x2c0faaf2b88>


sns.set(font_scale=1.5)
df = datamerge[datamerge['Date']>datetime.date(2016, 11, 1)].drop(['Date'], axis=1)
sns.pairplot(data = df,
                  y_vars=list(df.drop(['Adj Close', 'moving_avg', 'daily_change_close','daily_change_high', 'daily_change_low', 'daily_change_low_high', 'change'], axis =1)),
                  x_vars=list(df.drop(['Adj Close', 'moving_avg', 'daily_change_close','daily_change_high', 'daily_change_low', 'daily_change_low_high', 'change'], axis =1)), kind="reg")

<seaborn.axisgrid.PairGrid at 0x2c081859348>


list(datamerge)

['Date',
 'retweet_count',
 'favorite_count',
 'hour',
 'neg',
 'neu',
 'pos',
 'compound',
 'num_upper',
 'is_financial',
 'country_mentioned',
 'nouns',
 'verbs',
 'adverbs',
 'adjectives',
 'tweet_count',
 'Adj Close',
 'moving_avg',
 'daily_change_close',
 'daily_change_high',
 'daily_change_low',
 'daily_change_low_high',
 'no_tweet',
 'change',
 'res_length']


sns.set(font_scale=1)
#Split to train and test sets
X_train, X_test, y_train, y_test =train_test_split(datamerge.loc[:, list(datamerge.drop(['compound', 'Adj Close', 'moving_avg', 'daily_change_low', 'daily_change_high', 'daily_change_low_high','daily_change_close', 'Date', 'moving_avg', 'change'], axis=1))], 
                                                         datamerge.change, test_size=0.2, 
                                                         random_state = 109, 
                                                         stratify = datamerge.change)


X_train.shape

(799, 16)


from math import sqrt

t_data = datamerge.drop(['Adj Close', 'moving_avg', 'daily_change_low', 'daily_change_high', 'daily_change_low_high','daily_change_close', 'Date', 'moving_avg'], axis=1)

t_stat = pd.DataFrame()
for x in t_data.drop(['change'], axis=1).columns:
    df1 = df[df['change']==0]
    df2 = df[df['change']==1]
    mean1=df1.loc[:, x].mean()
    std1=df1.loc[:, x].std()
    n1 = df1.loc[:, x].count()
    mean2=df2.loc[:, x].mean()
    std2 = df2.loc[:, x].std()
    n2 = df2.loc[:, x].count()
    t_stat.loc[0, x] = abs(mean1-mean2)/sqrt(std1**2/n1+std2**2/n2)
    
#t_stat.T.rename(columns={0: "t-statistic"}) #calculated t-statistic for each feature:
sig_variables = t_stat.T.sort_values(0).reset_index()
sig_variables = sig_variables.rename(columns={'index':'Predictors', 0:'t-Statistic'})
sig_variables = sig_variables.sort_values(by=['t-Statistic'], ascending=False)
t_stat = t_stat.sort_values(by=0, axis=1, ascending=False)
sig_variables


plt.figure(figsize=(30, 15)) #figure size
N=len(list(t_stat))

#coefficient variables
bar = sig_variables.loc[:, ['t-Statistic']].values.flatten()


#coefficient magnitudes
#bars = np.add(neg_bar, pos_bar).tolist()

ind = np.arange(N)    # the x locations for the groups
width = 0.75       # the width of the bars

#fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15,12), sharex=True)
fig, (ax2) = plt.subplots(1, 1, figsize=(9,7), sharex=True)
#ax1.bar(ind, bar, width, color='b',edgecolor='white')
#ax1.set_xlabel('')
#ax1.set_ylim(ymin=150)
ax2.bar(ind, bar, width, color='b',edgecolor='white')
ax2.set_title('')
ax2.set_ylim(ymax=10)
fig.text(0.06, 0.5, 't-Statistic', ha='center', va='center', rotation='vertical', fontsize=22)
ax2.set_title('Variables effect on stock market volatility indicator', fontsize=22)
plt.xticks(ind, list(t_stat), rotation='vertical', fontsize=22)
plt.yticks(fontsize=18)
#fig.subplots_adjust(top=-1)
plt.show()

<Figure size 2160x1080 with 0 Axes>


list(X_train)

['retweet_count',
 'favorite_count',
 'hour',
 'neg',
 'neu',
 'pos',
 'num_upper',
 'is_financial',
 'country_mentioned',
 'nouns',
 'verbs',
 'adverbs',
 'adjectives',
 'tweet_count',
 'no_tweet',
 'res_length']


#Normalize data for KNN model
scaler = MinMaxScaler() #scale to 0-1 range
X_train_norm = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test_norm = pd.DataFrame(scaler.fit_transform(X_test), index=X_test.index, columns=X_test.columns)


#Determine KNN model K
for k in [1,2,3,4,5,7,9,10,15,25, 50, 75, 100]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train_norm, y_train)

    cross_validation_scores_KNN = cross_val_score(knn_model, X_train_norm, y_train.values.flatten(), cv=5)

    #estimating the CV accuracy and stdev of the model
    print("k =", k, "; Accuracy: %0.2f (+/- %0.2f)" % (cross_validation_scores_KNN.mean(), cross_validation_scores_KNN.std()))

k = 1 ; Accuracy: 0.57 (+/- 0.02)
k = 2 ; Accuracy: 0.55 (+/- 0.02)
k = 3 ; Accuracy: 0.58 (+/- 0.05)
k = 4 ; Accuracy: 0.56 (+/- 0.02)
k = 5 ; Accuracy: 0.59 (+/- 0.04)
k = 7 ; Accuracy: 0.61 (+/- 0.04)
k = 9 ; Accuracy: 0.61 (+/- 0.05)
k = 10 ; Accuracy: 0.62 (+/- 0.03)
k = 15 ; Accuracy: 0.63 (+/- 0.04)
k = 25 ; Accuracy: 0.65 (+/- 0.03)
k = 50 ; Accuracy: 0.65 (+/- 0.04)
k = 75 ; Accuracy: 0.66 (+/- 0.03)
k = 100 ; Accuracy: 0.66 (+/- 0.04)


# your code here
knn_model = KNeighborsClassifier(n_neighbors=75)
knn_model.fit(X_train_norm, y_train)

#Accuracy score calcualtion for test set prediction
KNN_test_predict = knn_model.predict(X_test_norm.values)
print('\nAccuracy score of KNN model at K=50 (test set):\n\n', accuracy_score(y_test, KNN_test_predict))

acc_knn = accuracy_score(y_test, KNN_test_predict)

Accuracy score of KNN model at K=50 (test set):

 0.645


cm_knn = metrics.confusion_matrix(y_test, knn_model.predict(X_test_norm.values))
print("Confusion matrix of KNN model predictions using the test dataset: \n\n", cm_knn)
print("\nThe KNN model at K=50 yielded N false positives, and N false negatives.")
TPR_knn = cm_knn[0, 0]/(cm_knn[0, 0]+cm_knn[1, 0])
TNR_knn = cm_knn[1, 1]/(cm_knn[1, 1]+cm_knn[0, 1])

Confusion matrix of KNN model predictions using the test dataset: 

 [[52 48]
 [23 77]]

The KNN model at K=50 yielded N false positives, and N false negatives.

0.616


fpr, tpr, _ = metrics.roc_curve(y_test.values, knn_model.predict_proba(X_test_norm.values).T[1])
auc = metrics.roc_auc_score(y_test.values, knn_model.predict_proba(X_test_norm.values).T[1])
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.legend(loc=4)
plt.xlabel("False positives rate")
plt.ylabel("True positives rate")
plt.title("ROC plot for KNN model (k=75, test dataset)")
plt.show()
auc_knn = auc


#fit logistic model with lasso regularization and 10-fold cross-validation
logreg_m_lasso = LogisticRegressionCV(cv=10, penalty = "l1", solver = 'liblinear') #10-fold cross-validation
logreg_m_lasso = logreg_m_lasso.fit(X_train_norm.values, y_train)

#Accuracy score calcualtion for test set prediction
lgm_test_predict = logreg_m_lasso.predict(X_test_norm.values)
print('\nAccuracy score of multiple logistic model with lasso regularization and 10-fold cross-validation (test set):\n\n', accuracy_score(y_test, lgm_test_predict))

#Accuracy score calcualtion for train set prediction
lgm_train_predict = logreg_m_lasso.predict(X_train_norm.values)
print('\nAccuracy score of multiple logistic model with lasso regularization and 10-fold cross-validation (train set):\n\n', accuracy_score(y_train, lgm_train_predict))

acc_logistic = accuracy_score(y_test, lgm_test_predict)

C:\Users\Yoyo\Anaconda3\lib\site-packages\sklearn\svm\base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
C:\Users\Yoyo\Anaconda3\lib\site-packages\sklearn\svm\base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)

Accuracy score of multiple logistic model with lasso regularization and 10-fold cross-validation (test set):

 0.625

Accuracy score of multiple logistic model with lasso regularization and 10-fold cross-validation (train set):

 0.6595744680851063


cm_logistic = metrics.confusion_matrix(y_test, logreg_m_lasso.predict(X_test_norm.values))
print("Confusion matrix of logistic model with lasso regularization using the test dataset: \n\n", cm_logistic)
print("\nThe of logistic model with lasso regularization yielded N false positives, and N false negatives.")
TPR_logistic = cm_logistic[0, 0]/(cm_logistic[0, 0]+cm_logistic[1, 0])
TNR_logistic = cm_logistic[1, 1]/(cm_logistic[1, 1]+cm_logistic[0, 1])

Confusion matrix of logistic model with lasso regularization using the test dataset: 

 [[51 49]
 [26 74]]

The of logistic model with lasso regularization yielded N false positives, and N false negatives.


fpr, tpr, _ = metrics.roc_curve(y_test.values, logreg_m_lasso.predict_proba(X_test_norm.values).T[1])
auc = metrics.roc_auc_score(y_test.values, logreg_m_lasso.predict_proba(X_test_norm.values).T[1])
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.legend(loc=4)
plt.xlabel("False positives rate")
plt.ylabel("True positives rate")
plt.title("ROC plot for regularized Logistic model (test dataset)")
plt.show()
auc_logistic = auc


def calc_meanstd(X_train, y_train, depths):
    '''
    
    Calculates variation in model accuracy with tree depth
    Plots train and test dataset accuracy as a function of tree depth
    
    Receives: predictors 2d array, response variable array, and the maximum tree depth.
    
    '''
    
    #Initialize variables
    train_scores = {} # dictionary of cv scores for train dataset
    cvmeans = {} # avarage CV scores - test dataset
    cvstds = {} # standard errors of CV scores - test dataset
    train_means = {} # avarage CV scores - train dataset
    train_std = {} # standard errors of CV scores - train dataset
    
    #Calculate cumulative accuracies 
    for depth in depths:
        dt = DecisionTreeClassifier(max_depth = depth)
        dt.fit(X_train, y_train)
        CV_scores = cross_val_score(dt, X_test, y_test, cv=15)
        train_CV = cross_val_score(dt, X_train, y_train, cv=15)
        train_scores[depth] = train_CV
        train_means[depth] = train_CV.mean()
        train_std[depth] = train_CV.std()
        cvmeans[depth] = CV_scores.mean()
        cvstds[depth] = CV_scores.std()
        
    #plot variables    
    x = list(cvmeans.keys()) # x-axis
    y = list(cvmeans.values()) # accuracies based on test dataset
    y_train = list(train_means.values()) # accuracies based on train dataset
    e = list(cvstds.values()) #error bars
    e_train = list(train_std.values()) #error bars
    lists = sorted(cvmeans.items())
    x, y = zip(*lists) 
    
    #plot
    plt.ylabel("Cross Validation Accuracy")
    plt.xlabel("Maximum Depth")
    plt.title('Variation of Accuracy with Depth - Simple Decision Tree')
    plt.plot(x, y, marker='o', label = 'test set', color='C0')
    plt.plot(x, y_train, marker='o', label = 'train set', color='C3')
    plt.errorbar(x, y, e, fmt='-o', capsize=5, elinewidth=2, markeredgewidth=2, color='C0')
    plt.errorbar(x, y_train, e_train, fmt='-o', capsize=5, elinewidth=2, markeredgewidth=2, color='C3')
    plt.legend()
    plt.show()
    # end of your code here
    return


depths = list(range(1, 21)) # tree depths to test
calc_meanstd(X_train, y_train, depths)


dt = DecisionTreeClassifier(max_depth = 3)
dt.fit(X_train, y_train)
CV_scores = cross_val_score(dt, X_test, y_test, cv=5)


fpr, tpr, _ = metrics.roc_curve(y_test.values, dt.predict_proba(X_test.values).T[1])
auc = metrics.roc_auc_score(y_test.values, dt.predict_proba(X_test.values).T[1])
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.legend(loc=4)
plt.xlabel("False positives rate")
plt.ylabel("True positives rate")
plt.title("ROC plot for Decision Tree model (test dataset)")
plt.show()
auc_dt = auc


#Accuracy score calcualtion for test set prediction
dt_test_predict = dt.predict(X_test.values)
print('\nAccuracy score of decision tree classfier (test set):\n\n', accuracy_score(y_test, dt_test_predict))

#Accuracy score calcualtion for train set prediction
dt_train_predict = dt.predict(X_train.values)
print('\nAccuracy score of decision tree classfier (train set):\n\n', accuracy_score(y_train, dt_train_predict))

acc_dt = dt.score(X_test, y_test)

Accuracy score of decision tree classfier (test set):

 0.655

Accuracy score of decision tree classfier (train set):

 0.6683354192740926


cm_dt = metrics.confusion_matrix(y_test, dt.predict(X_test.values))
print("Confusion matrix of logistic model with decision tree classfier using the test dataset: \n\n", cm_dt)
print("\nThe of decision tree classfier yielded N false positives, and N false negatives.")
TPR_dt = cm_dt[0, 0]/(cm_dt[0, 0]+cm_dt[1, 0])
TNR_dt = cm_dt[1, 1]/(cm_dt[1, 1]+cm_dt[0, 1])

Confusion matrix of logistic model with decision tree classfier using the test dataset: 

 [[62 38]
 [31 69]]

The of decision tree classfier yielded N false positives, and N false negatives.


#Decision tree feature importance
importances = dt.feature_importances_
#std = np.std([tree.feature_importances_ for tree in dt.estimators_],
#            axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances: Decision Tree")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="b", align="center")
plt.xticks(range(X_train.shape[1]), list(X_train.iloc[:, indices]), rotation='vertical')
plt.ylabel("Rank")
plt.xlim([-1, X_train.shape[1]])
plt.show()

Feature ranking:
1. feature 15 (0.512903)
2. feature 2 (0.142624)
3. feature 10 (0.113980)
4. feature 4 (0.081640)
5. feature 9 (0.076154)
6. feature 0 (0.072699)
7. feature 14 (0.000000)
8. feature 13 (0.000000)
9. feature 12 (0.000000)
10. feature 11 (0.000000)
11. feature 8 (0.000000)
12. feature 7 (0.000000)
13. feature 6 (0.000000)
14. feature 5 (0.000000)
15. feature 3 (0.000000)
16. feature 1 (0.000000)


f = tweets.groupby(['Date']).count().reset_index()
d = (f.Date >= datetime.date(year=2016,month=11,day=1))
#display(f)
plt.figure(figsize=(18,10))
#plt.scatter(f['Date'], f['is_retweet'])
x = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Date']
y1 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['moving_avg']
y2 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Adj Close']  
plt.plot(x, preprocessing.scale(y2), marker = 'o' , label = 'Adj Close', color='green', alpha=0.5)
plt.plot(f[d].Date, preprocessing.scale(f[d].is_retweet), marker = 'o' , label = 'Adj Close', color='grey', alpha=0.5)
plt.plot(f[d].Date, preprocessing.scale(f[d].adjectives), marker = 'o' , label = 'Adj Close', color='orange', alpha=0.5)

#sns.scatterplot(x='Date', y='is_retweet', data=preprocessing.scale(f[d].is_retweet))
plt.ylabel("Tweets per Day", fontsize=24)
plt.xlabel("Date", fontsize=24)
plt.title("Tweets per Day Over Time", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

datamerge = datamerge.drop(['DateEST'], axis=1)


f = tweets.groupby(['Date']).count().reset_index()
d = (f.Date >= datetime.date(year=2016,month=11,day=1))
#display(f)
plt.figure(figsize=(18,10))
#plt.scatter(f['Date'], f['is_retweet'])
x = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Date']
y1 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['moving_avg']
y2 = CBOE[CBOE['Date']>datetime.date(2016, 11, 1)]['Adj Close']  
plt.plot(x, preprocessing.scale(y2), marker = 'o' , label = 'Adj Close', color='green', alpha=0.5)
plt.plot(f[d].Date, preprocessing.scale(f[d].is_retweet), marker = 'o' , label = 'Adj Close', color='grey', alpha=0.5)
plt.plot(f[d].Date, preprocessing.scale(f[d].adjectives), marker = 'o' , label = 'Adj Close', color='orange', alpha=0.5)

#sns.scatterplot(x='Date', y='is_retweet', data=preprocessing.scale(f[d].is_retweet))
plt.ylabel("Tweets per Day", fontsize=24)
plt.xlabel("Date", fontsize=24)
plt.title("Tweets per Day Over Time", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

datamerge = datamerge.drop(['DateEST'], axis=1)


def calc_meanstd(X_train, y_train, depths):
    '''
    
    Calculates variation in model accuracy with tree depth
    Plots train and test dataset accuracy as a function of tree depth
    
    Receives: predictors 2d array, response variable array, and the maximum tree depth.
    
    '''
    
    #Initialize variables
    train_scores = {} # dictionary of cv scores for train dataset
    cvmeans = {} # avarage CV scores - test dataset
    cvstds = {} # standard errors of CV scores - test dataset
    train_means = {} # avarage CV scores - train dataset
    train_std = {} # standard errors of CV scores - train dataset
    
    #Calculate cumulative accuracies 
    for depth in depths:
        dt = RandomForestClassifier(n_estimators = 145, max_depth=depth, max_features=1+round(sqrt(len(list(X_train)))))
        dt.fit(X_train, y_train)
        CV_scores = cross_val_score(dt, X_test, y_test, cv=15)
        train_CV = cross_val_score(dt, X_train, y_train, cv=15)
        train_scores[depth] = train_CV
        train_means[depth] = train_CV.mean()
        train_std[depth] = train_CV.std()
        cvmeans[depth] = CV_scores.mean()
        cvstds[depth] = CV_scores.std()
        
    #plot variables    
    x = list(cvmeans.keys()) # x-axis
    y = list(cvmeans.values()) # accuracies based on test dataset
    y_train = list(train_means.values()) # accuracies based on train dataset
    e = list(cvstds.values()) #error bars
    e_train = list(train_std.values()) #error bars
    lists = sorted(cvmeans.items())
    x, y = zip(*lists) 
    
    #plot
    plt.ylabel("Cross Validation Accuracy")
    plt.xlabel("Maximum Depth")
    plt.title('Variation of Accuracy with Depth - Random Forest Classifier')
    plt.plot(x, y, marker='o', label = 'test set', color='C0')
    plt.plot(x, y_train, marker='o', label = 'train set', color='C3')
    plt.errorbar(x, y, e, fmt='-o', capsize=5, elinewidth=2, markeredgewidth=2, color='C0')
    plt.errorbar(x, y_train, e_train, fmt='-o', capsize=5, elinewidth=2, markeredgewidth=2, color='C3')
    plt.legend()
    plt.show()
    # end of your code here
    return 

depths = list(range(1, 20)) # tree depths to test
calc_meanstd(X_train, y_train, depths)


from math import sqrt

np.random.seed(0)

forest = RandomForestClassifier(n_estimators = 145, max_depth = 12, max_features=1+round(sqrt(len(list(X_train)))))
forest.fit(X_train, y_train)

random_forest_test_score = accuracy_score(y_test, forest.predict(X_test.values))
print('\nAccuracy score of random forest model with 12 splits and 1+sqrt(N) features selected at each split (test set): \n', random_forest_test_score)

random_forest_train_score = accuracy_score(y_train, forest.predict(X_train.values))
print('\nAccuracy score of random forest model with 12 splits and 1+sqrt(N) features selected at each split (train set): \n', random_forest_train_score)

acc_forest = forest.score(X_test, y_test)

Accuracy score of random forest model with 12 splits and 1+sqrt(N) features selected at each split (test set): 
 0.68

Accuracy score of random forest model with 12 splits and 1+sqrt(N) features selected at each split (train set): 
 0.9924906132665833


cm_forest = metrics.confusion_matrix(y_test, forest.predict(X_test.values))
print("Confusion matrix of random forest model using the test dataset: \n\n", cm_forest)
print("\nThe of decision tree classfier yielded 26 false negatives, and 32 false positives.")
TPR_forest = cm_forest[0, 0]/(cm_forest[0, 0]+cm_forest[1, 0])
TNR_forest = cm_forest[1, 1]/(cm_forest[1, 1]+cm_forest[0, 1])

Confusion matrix of random forest model using the test dataset: 

 [[69 31]
 [33 67]]

The of decision tree classfier yielded 26 false negatives, and 32 false positives.


fpr, tpr, _ = metrics.roc_curve(y_test.values, forest.predict_proba(X_test.values).T[1])
auc = metrics.roc_auc_score(y_test.values, forest.predict_proba(X_test.values).T[1])
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.legend(loc=4)
plt.xlabel("False positives rate")
plt.ylabel("True positives rate")
plt.title("ROC plot for Random Forest model (test dataset)")
plt.show()
auc_forest = auc


#change classification threshold by ROC curve
predict_probabilities = forest.predict_proba(X_test)
predict_mine = np.where(predict_probabilities[:,1] > 0.5, 1, 0)
confusion_matrix(y_test, predict_mine)

array([[69, 31],
       [33, 67]], dtype=int64)


#Random Forest feature importance
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances: Random Forest")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="b", yerr=[std[indices]/1000, std[indices]/2], align="center")
plt.xticks(range(X_train.shape[1]), list(X_train.iloc[:, indices]), rotation='vertical')
plt.ylabel("Rank")
plt.xlim([-1, X_train.shape[1]])
plt.show()

Feature ranking:
1. feature 9 (0.101684)
2. feature 10 (0.096159)
3. feature 15 (0.092949)
4. feature 12 (0.078014)
5. feature 2 (0.073748)
6. feature 4 (0.068919)
7. feature 5 (0.064404)
8. feature 0 (0.063193)
9. feature 3 (0.062921)
10. feature 11 (0.061992)
11. feature 1 (0.057607)
12. feature 6 (0.054340)
13. feature 13 (0.051179)
14. feature 7 (0.044436)
15. feature 8 (0.028148)
16. feature 14 (0.000306)


fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20,17), sharey=True)
for n, ax in enumerate(axs.ravel()):
    tree_n = DecisionTreeClassifier(max_depth=n+9)
    clf = AdaBoostClassifier(base_estimator=tree_n, n_estimators=200, random_state=10000, learning_rate=0.05)
    clf.fit(X_train, y_train) 

    #iterate over the n-esimators to generate accuracy score arrays for the charts
    train_scores = []
    test_scores = []
    for score in clf.staged_score(X_train, y_train, sample_weight=None): 
        train_scores.append(score) #append to scores array; ultimately y axis in the chart

    for score in clf.staged_score(X_test, y_test, sample_weight=None): 
        test_scores.append(score) #append to scores array; ultimately y axis in the chart

    #plot effect of the n-estimators/iterations on the model accuracy using train and test datasets
    ax.set_ylabel("AdaBoost classifier accuracy")
    ax.set_xlabel("Number of estimators")
    ax.set_title('AdaBoost classifier accuracy with base learner depth of '+str(n+9)+'\nand as a function of n-estimateros (train and test datasets)')
    ax.plot(np.arange(1, 201, 1), train_scores, marker='', label = 'train dataset', color='blue')
    ax.plot(np.arange(1, 201, 1), test_scores, marker='', label = 'test dataset', color='orange')
    ax.legend()
plt.show()


# your code here
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=11), n_estimators=18, random_state=10000, learning_rate=0.05)
clf.fit(X_train, y_train) 

print('\n\n Model accuracy at 18 interations and learning rate of 0.05 (test dataset):\n\n', clf.score(X_test, y_test))
print('\n\n Model accuracy at 18 interations and learning rate of 0.05 (train dataset):\n\n', clf.score(X_train, y_train))
acc_clf = clf.score(X_test, y_test)


 Model accuracy at 18 interations and learning rate of 0.05 (test dataset):

 0.675


 Model accuracy at 18 interations and learning rate of 0.05 (train dataset):

 0.9924906132665833


cm = metrics.confusion_matrix(y_test, clf.predict(X_test.values))
print("Confusion matrix of AdaBoost model predictions using the test dataset: \n\n", cm)
print("\nThe random AdaBoost model yielded N false positives, and N false negatives.")
TPR_clf = cm[0, 0]/(cm[0, 0]+cm[1, 0])
TNR_clf = cm[1, 1]/(cm[1, 1]+cm[0, 1])

Confusion matrix of AdaBoost model predictions using the test dataset: 

 [[69 31]
 [34 66]]

The random AdaBoost model yielded N false positives, and N false negatives.


fpr, tpr, _ = metrics.roc_curve(y_test.values, clf.predict_proba(X_test.values).T[1])
auc = metrics.roc_auc_score(y_test.values, clf.predict_proba(X_test.values).T[1])
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.legend(loc=4)
plt.xlabel("False positives rate")
plt.ylabel("True positives rate")
plt.title("ROC plot of AdaBoost model (test dataset)")
plt.show()
auc_clf = auc


#AdaBoost feature importance
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances: AdaBoost")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="b", yerr=[std[indices]/1000, std[indices]/2], align="center")
plt.xticks(range(X_train.shape[1]), list(X_train.iloc[:, indices]), rotation='vertical')
plt.ylabel("Rank")
plt.xlim([-1, X_train.shape[1]])
plt.show()

Feature ranking:
1. feature 2 (0.206745)
2. feature 3 (0.202626)
3. feature 12 (0.169310)
4. feature 15 (0.099719)
5. feature 4 (0.064997)
6. feature 10 (0.036086)
7. feature 11 (0.032300)
8. feature 5 (0.028962)
9. feature 6 (0.028091)
10. feature 0 (0.026789)
11. feature 9 (0.026025)
12. feature 13 (0.023209)
13. feature 1 (0.019111)
14. feature 8 (0.018921)
15. feature 7 (0.017066)
16. feature 14 (0.000043)


datamerge_reg = datamerge.copy()
datamerge_reg['change'] = datamerge_reg['daily_change_low_high']
datamerge_reg = datamerge_reg.dropna()
#Split to train and test sets
X_train_reg, X_test_reg, y_train_reg, y_test_reg =train_test_split(datamerge_reg.loc[:, list(datamerge_reg.drop(['compound','Adj Close', 'moving_avg', 'daily_change_low', 'daily_change_high', 'daily_change_low_high','daily_change_close', 'Date', 'moving_avg', 'change'], axis=1))], 
                                                         datamerge_reg.daily_change_low_high, test_size=0.2, 
                                                         random_state = 109,  stratify = datamerge.change)


X_train_reg.shape

(799, 16)


from sklearn.preprocessing import power_transform
from sklearn.preprocessing import PowerTransformer

#transform response variable
y_test_reg_tr= power_transform(y_test_reg.values.reshape(-1,1), method='yeo-johnson', standardize=False).reshape(1,-1)[0] 
y_train_reg_tr= power_transform(y_train_reg.values.reshape(-1,1), method='yeo-johnson', standardize=False).reshape(1,-1)[0] 

from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson', standardize=False)
pt.fit(X_train_reg)
pt.fit(X_test_reg)
X_train_reg_tr = pt.transform(X_train_reg)
X_test_reg_tr = pt.transform(X_test_reg)


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


forest_reg = RandomForestRegressor(n_estimators = 150, max_depth=4, max_features=round(sqrt(len(list(X_train)))))
forest_reg.fit(X_train_reg_tr, y_train_reg_tr)
print('R-squared: \n\n', r2_score(y_test_reg_tr, forest_reg.predict(X_test_reg_tr)))
print('\n\nRMSE: \n\n', mean_squared_error(y_test_reg_tr, forest_reg.predict(X_test_reg_tr)))

R-squared: 

 0.16857387659458611


RMSE: 

 0.005669872999686513


datamerge_reg['change'].mean()

0.8591892339871254


#plots histogram of residuals:
sns.set(font_scale=1.35)
from scipy import stats

fig, ax = plt.subplots()

resid = y_train_reg_tr - forest_reg.predict(X_train_reg_tr)
ax.hist(resid, bins=25)
ax.set_title('Histogram of random forest regressor residuals',fontsize=16);
ax.set_title('Residuals vs. fitted values',fontsize=16)
ax.set_ylabel('Frequency',fontsize=16)
ax.set_xlabel('Residual',fontsize=16);

plt.show()

y = y_train_reg_tr
yhat = forest_reg.predict(X_train_reg_tr)

fig, ax = plt.subplots()
ax.scatter(yhat, y-yhat)
line_fit = yhat
plt.axhline(y=0.0, color='r', linestyle='-')

ax.set_title('Residuals vs. Fitted values',fontsize=16)
ax.set_ylabel('y-yhat (Residuals)',fontsize=16)
ax.set_xlabel('yhat (Fitted values)',fontsize=16);
plt.show()


test_predictions = forest_reg.predict(X_train_reg_tr).flatten()

a = plt.axes(aspect='equal')
plt.title('True values vs predictions',fontsize=16)
plt.scatter(y_train_reg_tr, test_predictions)
plt.xlabel('True Indicator Values')
plt.ylabel('Regressor Predictions')
lims = [0, 1]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)


#Random forest regressor feature importance
importances = forest_reg.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest_reg.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
       color="b", yerr=[std[indices]/1000, std[indices]/2], align="center")
plt.xticks(range(X_train.shape[1]), list(X_train.iloc[:, indices]), rotation='vertical')
plt.xlim([-1, X_train.shape[1]])
plt.show()

Feature ranking:
1. feature 9 (0.195572)
2. feature 15 (0.158221)
3. feature 10 (0.143703)
4. feature 12 (0.103107)
5. feature 11 (0.062786)
6. feature 0 (0.044886)
7. feature 2 (0.044204)
8. feature 5 (0.041329)
9. feature 1 (0.039278)
10. feature 6 (0.033268)
11. feature 4 (0.032424)
12. feature 3 (0.030122)
13. feature 7 (0.029206)
14. feature 13 (0.029117)
15. feature 8 (0.012776)
16. feature 14 (0.000001)


X_train_reg_tr.shape

(799, 16)


#Optimized model
from sklearn import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers

model = tf.keras.models.Sequential(name='fully_connected_neural_network')
count = 0 #counte for chart plotting 

# 3 hidden layers with dropout and regularization setting
model.add(layers.Dense(150, activation='relu', input_shape=(16,), 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
#model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
#model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))



# output layer
model.add(layers.Dense(1,  activation='sigmoid'))

model.compile(optimizer='adam',
                loss='binary_crossentropy', metrics=['accuracy'])


model.summary()

#Fit model using scaled data
model_fit = model.fit(preprocessing.scale(X_train.values), y_train.values, validation_data=(preprocessing.scale(X_test.values), y_test.values), epochs=200, batch_size=128, verbose=0)


#Calcualte model accuracy
train_loss, train_acc = model.evaluate(preprocessing.scale(X_train.values), y_train.values, verbose=0)
#print("Model train accuracy: ", model_fit.history['accuracy'][len(model_fit.history['accuracy'])-1])
#print("Model validation accuracy: ", model_fit.history['val_accuracy'][len(model_fit.history['accuracy'])-1])

# summarize history for model accuracy
plt.figure(figsize=(10, 5))
plt.plot(model_fit.history['accuracy'])
plt.plot(model_fit.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train accuracy', 'validation accuracy'], loc='lower right')
plt.show()

Model: "fully_connected_neural_network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_235 (Dense)            (None, 150)               2550      
_________________________________________________________________
dropout_53 (Dropout)         (None, 150)               0         
_________________________________________________________________
dense_236 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_237 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_238 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_239 (Dense)            (None, 1)                 151       
=================================================================
Total params: 70,651
Trainable params: 70,651
Non-trainable params: 0
_________________________________________________________________


model = tf.keras.models.Sequential(name='fully_connected_neural_network')
count = 0 #counte for chart plotting 

# 3 hidden layers with dropout and regularization setting
model.add(layers.Dense(150, activation='relu', input_shape=(16,), 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
#model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
#model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))



# output layer
model.add(layers.Dense(1,  activation='sigmoid'))

model.compile(optimizer='adam',
                loss='binary_crossentropy', metrics=['accuracy'])


model.summary()

#Fit model using scaled data
model_fit = model.fit(preprocessing.scale(X_train.values), y_train.values, validation_data=(preprocessing.scale(X_test.values), y_test.values), epochs=10, batch_size=128, verbose=0)

Model: "fully_connected_neural_network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_142 (Dense)            (None, 150)               2550      
_________________________________________________________________
dropout_33 (Dropout)         (None, 150)               0         
_________________________________________________________________
dense_143 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_144 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_145 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_146 (Dense)            (None, 1)                 151       
=================================================================
Total params: 70,651
Trainable params: 70,651
Non-trainable params: 0
_________________________________________________________________


# your code here
print("Neural Net model train accuracy: ", model_fit.history['accuracy'][len(model_fit.history['accuracy'])-1])
print("Neural Neet model validation accuracy: ", model_fit.history['val_accuracy'][len(model_fit.history['accuracy'])-1])
acc_nn = model_fit.history['val_accuracy'][len(model_fit.history['accuracy'])-1]


cm = metrics.confusion_matrix(y_test.values, model.predict_classes(preprocessing.scale(X_test.values)))
print("\nConfusion matrix of AdaBoost model predictions using the test dataset: \n\n", cm)
print("\nNeural net classifier yielded N false positives, and N false negatives.")
TPR_nn = cm[0, 0]/(cm[0, 0]+cm[1, 0])
TNR_nn = cm[1, 1]/(cm[1, 1]+cm[0, 1])

fpr, tpr, _ = metrics.roc_curve(y_test.values, model.predict(preprocessing.scale(X_test.values)))
auc = metrics.roc_auc_score(y_test.values, model.predict(preprocessing.scale(X_test.values)))
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.legend(loc=4)
plt.xlabel("False positives rate")
plt.ylabel("True positives rate")
plt.title("ROC plot of Neural Net model (test dataset)")
plt.show()
auc_nn = auc

Neural Net model train accuracy:  0.64580727
Neural Neet model validation accuracy:  0.65

Confusion matrix of AdaBoost model predictions using the test dataset: 

 [[69 31]
 [39 61]]

Neural net classifier yielded N false positives, and N false negatives.


#Optimized model
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers


model = tf.keras.models.Sequential(name='fully_connected_neural_network')

# 4 hidden layers with dropout and regularization setting
model.add(layers.Dense(150, activation='relu', input_shape=(16,), 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))
#model.add(layers.Dropout(0.2))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))

model.add(layers.Dense(150, activation='relu', 
                                kernel_regularizer= tf.keras.regularizers.l2(l=0.05)))

# output layer
model.add(layers.Dense(1,  activation='linear'))



model.compile(optimizer='Adam',
                  loss=['mse'],
                  metrics=['mse'])

model.optimizer.learningRate = 1

model.summary()

model_fit = model.fit(preprocessing.scale(X_train_reg_tr), y_train_reg_tr, validation_data=(preprocessing.scale(X_test_reg_tr), y_test_reg_tr), epochs=300, batch_size=128, verbose=0)

Model: "fully_connected_neural_network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_225 (Dense)            (None, 150)               2550      
_________________________________________________________________
dropout_51 (Dropout)         (None, 150)               0         
_________________________________________________________________
dense_226 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_227 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_228 (Dense)            (None, 150)               22650     
_________________________________________________________________
dense_229 (Dense)            (None, 1)                 151       
=================================================================
Total params: 70,651
Trainable params: 70,651
Non-trainable params: 0
_________________________________________________________________


import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
from tensorflow import keras
from tensorflow.keras import layers

EPOCHS = 200

history = model.fit(
  preprocessing.scale(X_train_reg_tr), y_train_reg_tr,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[tfdocs.modeling.EpochDots()])



hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()


plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)


plotter.plot({'Basic': history}, metric = "mse")
#plt.ylim([0, 20])
plt.ylabel('MSE [Volatility Indicator]')

Epoch: 0, loss:0.0076,  mse:0.0076,  val_loss:0.0074,  val_mse:0.0074,  
....................................................................................................
Epoch: 100, loss:0.0077,  mse:0.0077,  val_loss:0.0076,  val_mse:0.0076,  
....................................................................................................

Text(0, 0.5, 'MSE [Volatility Indicator]')


loss, mse = model.evaluate(preprocessing.scale(X_train_reg_tr), y_train_reg_tr, verbose=2)

print("Testing MSE: {:5.5f} index".format(mse))

799/1 - 0s - loss: 0.0070 - mse: 0.0076
Testing MSE: 0.00762 index


print('R-squared: \n\n', r2_score(y_test_reg_tr, model.predict(preprocessing.scale(X_test_reg_tr))))
print('\n\nRMSE: \n\n', mean_squared_error(y_test_reg_tr, model.predict(preprocessing.scale(X_test_reg_tr))))

R-squared: 

 -0.0066767957888778096


RMSE: 

 0.008025926757142062


test_predictions = model.predict(preprocessing.scale(X_train_reg_tr)).flatten()

a = plt.axes(aspect='equal')
plt.title('True values vs predictions',fontsize=16)
plt.scatter(y_train_reg_tr, test_predictions)
plt.xlabel('True Indicator Values')
plt.ylabel('Regressor Predictions')
lims = [0, 0.75]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)


#plots histogram of residuals:
sns.set(font_scale=1.35)
from scipy import stats

fig, ax = plt.subplots()

resid = y_train_reg_tr - model.predict(preprocessing.scale(X_train_reg_tr)).reshape(1,-1)[0]
ax.hist(resid, bins=25)
ax.set_title('Histogram of neural net regressor residuals',fontsize=16);
ax.set_title('Residuals vs. fitted values',fontsize=16)
ax.set_ylabel('Frequency',fontsize=16)
ax.set_xlabel('Residuals',fontsize=16);

plt.show()

y = y_train_reg_tr
yhat = model.predict(preprocessing.scale(X_train_reg_tr)).reshape(1,-1)[0]

fig, ax = plt.subplots()
ax.scatter(yhat, y-yhat)
line_fit = yhat
plt.axhline(y=0.0, color='r', linestyle='-')

ax.set_title('Residuals vs. Fitted values',fontsize=16)
ax.set_ylabel('y-yhat (Residuals)',fontsize=16)
ax.set_xlabel('yhat (Fitted values)',fontsize=16);
plt.show()


pd.DataFrame({'Model': ['KNN', 'l2-Logistic', 'Decision Tree', 'Random Forest', 'AdaBoost', 'Neural Net'], 
              'Acuracy Score': [acc_knn, acc_logistic, acc_dt, acc_forest, acc_clf, acc_nn], 
             'AUC': [auc_knn, auc_logistic, auc_dt, auc_forest, auc_clf, auc_nn],
             'TPR': [TPR_knn, TPR_logistic, TPR_dt, TPR_forest, TPR_clf, TPR_nn],
             'TNR': [TNR_knn, TNR_logistic, TNR_dt, TNR_forest, TNR_clf, TNR_nn]}
            ).round(3).sort_values(by='Acuracy Score')

	source	text	Date	retweet_count	favorite_count	is_retweet	id_str	hour
0	Twitter for iPhone	“Serial killers get more Due Process than the ...	2019-10-14	19867.0	74617	false	1.183541e+18	0.0
1	Twitter for iPhone	Somebody please explain to Chris Wallace of Fo...	2019-10-14	26122.0	106477	false	1.183535e+18	0.0
2	Twitter for iPhone	The U.S. has the worst of the ISIS prisoners. ...	2019-10-13	16861.0	67151	false	1.183525e+18	23.0
3	Twitter for iPhone	.@marklevinshow on @FoxNews is doing a big sho...	2019-10-13	14569.0	50631	false	1.183520e+18	23.0
4	Twitter for iPhone	.....BY THE WAY DON’T CALL ME AGAIN I’LL CALL ...	2019-10-13	18997.0	71535	false	1.183507e+18	22.0

	Date	Open	High	Low	Close	Adj Close	Volume
0	2016-11-01	2128.679932	2131.449951	2097.850098	2111.719971	2111.719971	4532160000
1	2016-11-02	2109.429932	2111.760010	2094.000000	2097.939941	2097.939941	4248580000
2	2016-11-03	2098.800049	2102.560059	2085.229980	2088.659912	2088.659912	3886740000
3	2016-11-04	2083.790039	2099.070068	2083.790039	2085.179932	2085.179932	3837860000
4	2016-11-07	2100.590088	2132.000000	2100.590088	2131.520020	2131.520020	3736060000

	Date	Open	High	Low	Close	Adj Close	Volume
0	2016-10-31	2128.679932	2131.449951	2097.850098	2111.719971	2111.719971	4532160000
1	2016-11-01	2109.429932	2111.760010	2094.000000	2097.939941	2097.939941	4248580000
2	2016-11-02	2098.800049	2102.560059	2085.229980	2088.659912	2088.659912	3886740000
3	2016-11-03	2083.790039	2099.070068	2083.790039	2085.179932	2085.179932	3837860000
4	2016-11-06	2100.590088	2132.000000	2100.590088	2131.520020	2131.520020	3736060000

	Date	Open	High	Low	Close	Adj Close	Volume	moving_avg	daily_change_close	daily_change_high	daily_change_low	daily_change_low_high
724	2019-09-18	3010.360107	3021.989990	3003.159912	3006.790039	3006.790039	3251290000	2964.875366	2.442608	2.960477	2.318928	0.625509
725	2019-09-19	3008.419922	3016.370117	2984.679932	2992.070068	2992.070068	6094740000	2964.690360	1.805608	2.632420	1.554157	1.053383
726	2019-09-22	2983.500000	2999.149902	2982.229980	2991.780029	2991.780029	3186590000	2965.107143	1.714829	1.965391	1.390146	0.567117
727	2019-09-23	3002.429932	3007.979980	2957.729980	2966.600098	2966.600098	3868160000	2967.392857	0.704690	2.109378	0.403583	1.673644
728	2019-09-24	2968.350098	2989.820068	2952.860107	2984.870117	2984.870117	3318870000	2968.406067	1.156541	1.324294	0.071729	1.245135

	source	text	Date	retweet_count	favorite_count	is_retweet	id_str	hour	neg	neu	pos	compound	length	num_upper	country_mentioned
0	Twitter for iPhone	“Serial killers get more Due Process than the ...	2019-10-14	19867.0	74617	false	1.183541e+18	0.0	0.186	0.693	0.121	-0.3612	18	0	0
1	Twitter for iPhone	Somebody please explain to Chris Wallace of Fo...	2019-10-14	26122.0	106477	false	1.183535e+18	0.0	0.000	0.891	0.109	0.6696	49	1	1
2	Twitter for iPhone	The U.S. has the worst of the ISIS prisoners. ...	2019-10-13	16861.0	67151	false	1.183525e+18	23.0	0.177	0.769	0.054	-0.7514	44	2	1
3	Twitter for iPhone	.@marklevinshow on @FoxNews is doing a big sho...	2019-10-13	14569.0	50631	false	1.183520e+18	23.0	0.200	0.800	0.000	-0.8588	42	2	0
4	Twitter for iPhone	.....BY THE WAY DON’T CALL ME AGAIN I’LL CALL ...	2019-10-13	18997.0	71535	false	1.183507e+18	22.0	0.123	0.877	0.000	-0.7345	48	19	0

year-month	2016-11	2016-12	2017-1	2017-10	2017-11	2017-12	2017-2	2017-3	2017-4	2017-5	...	2019-1	2019-10	2019-2	2019-3	2019-4	2019-5	2019-6	2019-7	2019-8	2019-9
neg	0.0576	0.0653333	0.0895735	0.0868	0.0857384	0.08016	0.107902	0.0680222	0.0679203	0.0750347	...	0.0961483	0.0914158	0.0815078	0.0942069	0.102625	0.0832611	0.0788425	0.0808675	0.085702	0.074418
pos	0.158194	0.129894	0.10252	0.14631	0.155219	0.168331	0.11819	0.146741	0.148746	0.141736	...	0.126659	0.107663	0.124218	0.121066	0.133489	0.141037	0.136431	0.133271	0.153673	0.126692
neu	0.78423	0.804773	0.807902	0.766882	0.759051	0.751514	0.773922	0.785259	0.783326	0.783229	...	0.777159	0.800941	0.794249	0.784734	0.763873	0.775727	0.784729	0.785859	0.760646	0.798884

year-month	2016-11	2016-12	2017-1	2017-10	2017-11	2017-12	2017-2	2017-3	2017-4	2017-5	...	2019-1	2019-10	2019-2	2019-3	2019-4	2019-5	2019-6	2019-7	2019-8	2019-9
neg	0.0934457	0.0972801	0.109256	0.102587	0.105664	0.0953548	0.124425	0.104762	0.106756	0.100927	...	0.101128	0.117982	0.0985711	0.114591	0.112635	0.0959443	0.0915755	0.0982932	0.10913	0.100972
pos	0.155262	0.123803	0.121608	0.148582	0.145271	0.144363	0.134818	0.142355	0.145522	0.142204	...	0.117432	0.133024	0.114534	0.147444	0.140379	0.132924	0.132081	0.130687	0.14877	0.147041
neu	0.161257	0.142739	0.137292	0.147623	0.152811	0.13962	0.155457	0.155846	0.153736	0.140993	...	0.132314	0.159276	0.128352	0.161963	0.143658	0.136687	0.143894	0.139845	0.153881	0.157464

year-month	2016-11	2016-12	2017-1	2017-10	2017-11	2017-12	2017-2	2017-3	2017-4	2017-5	...	2019-1	2019-10	2019-2	2019-3	2019-4	2019-5	2019-6	2019-7	2019-8	2019-9
nouns	6.82424	6.45455	6.31863	6.93333	8.52743	10.8057	6.62745	6.68889	6.71014	6.61806	...	10.4345	8.66997	9.8342	8.77586	9.21362	9.86946	9.77901	9.23494	8.87585	8.748
verbs	1.93939	2.14394	2.26471	1.93725	2.56962	3.05714	2.06536	1.65926	1.91304	1.86111	...	3.78621	2.74917	3.17098	2.9	3.1517	3.49507	3.52762	3.28313	3.22348	2.964
descriptive	1.98788	2.02273	2.17647	1.95294	2.8692	3.45143	2.0915	1.93333	2.05072	1.91667	...	3.75172	2.81848	3.46632	3.1069	3.30341	3.68966	3.40055	3.50602	3.55305	2.972

year-month	2016-11	2016-12	2017-1	2017-10	2017-11	2017-12	2017-2	2017-3	2017-4	2017-5	...	2019-1	2019-10	2019-2	2019-3	2019-4	2019-5	2019-6	2019-7	2019-8	2019-9
nouns	2.73628	2.45684	2.16589	2.59305	3.71841	4.58596	2.414	2.54941	2.51478	2.47221	...	4.67758	5.05303	4.95537	4.65354	4.33757	4.3188	4.66386	4.71427	4.62292	4.82774
verbs	1.39554	1.38769	1.48839	1.28436	1.87094	1.95558	1.34102	1.29386	1.35882	1.38226	...	2.5388	2.47567	2.38851	2.51232	2.33364	2.33571	2.6095	2.61365	2.49655	2.57293
descriptive	1.78403	1.61796	1.5499	1.49202	2.30598	2.51357	1.65974	1.61245	1.69293	1.60201	...	2.74406	2.50629	2.78765	2.73241	2.82088	2.62321	2.46803	2.97491	2.86084	2.71919

	Date	tweet_count
0	2015-01-05	141
1	2015-01-06	107
2	2013-07-24	88
3	2016-10-20	87
4	2013-02-27	86

	Date	retweet_count	favorite_count	hour	neg	neu	pos	compound	length	num_upper	is_financial	country_mentioned	nouns	verbs	adverbs	adjectives	tweet_count
2989	2019-09-25	17074.304348	65088.260870	10.956522	0.045174	0.883957	0.070783	0.031443	19.217391	1.086957	0.260870	0.304348	6.695652	2.521739	0.652174	0.826087	23
2990	2019-09-26	16470.052632	69631.631579	13.578947	0.131947	0.698632	0.169474	0.137826	21.578947	2.684211	0.421053	0.105263	7.684211	1.578947	0.684211	1.473684	19
2991	2019-09-27	21337.136364	79088.409091	9.045455	0.072455	0.887682	0.039909	-0.161205	23.045455	1.954545	0.045455	0.318182	6.590909	2.954545	1.227273	1.590909	22
2992	2019-09-28	26257.923077	111616.076923	14.461538	0.101231	0.777615	0.121154	0.056792	22.153846	2.307692	0.076923	0.153846	6.769231	1.769231	1.384615	1.615385	13
2993	2019-09-29	19495.125000	72159.000000	14.250000	0.064313	0.829875	0.105812	0.015494	21.250000	0.812500	0.250000	0.062500	6.562500	2.375000	1.375000	1.437500	16
2994	2019-09-30	19440.121212	75423.666667	10.909091	0.062455	0.855030	0.082545	0.087415	29.727273	2.060606	0.151515	0.363636	9.696970	2.909091	1.303030	1.272727	33
2995	2019-10-01	26376.400000	104304.066667	13.866667	0.053867	0.851667	0.094467	0.137720	31.133333	1.066667	0.133333	0.066667	9.533333	2.933333	1.466667	1.466667	15
2996	2019-10-02	29192.150000	104145.350000	15.300000	0.103850	0.791000	0.105150	0.026060	33.050000	1.750000	0.450000	0.050000	10.100000	3.600000	1.100000	2.000000	20
2997	2019-10-03	16996.080000	65096.440000	15.880000	0.087000	0.779840	0.133120	0.101512	17.000000	0.520000	0.240000	0.040000	6.800000	1.680000	0.800000	1.320000	25
2998	2019-10-04	20565.172414	78005.172414	12.103448	0.071379	0.873759	0.054862	-0.004862	24.482759	0.862069	0.206897	0.034483	7.931034	2.137931	0.862069	1.655172	29
2999	2019-10-05	18869.233333	69711.533333	9.733333	0.094167	0.838733	0.067133	-0.019643	19.766667	1.333333	0.100000	0.266667	7.033333	2.000000	0.600000	0.900000	30
3000	2019-10-06	19209.160000	76156.440000	15.600000	0.059800	0.844400	0.095840	0.117840	23.880000	0.960000	0.320000	0.120000	7.880000	2.600000	0.800000	1.320000	25
3001	2019-10-07	18939.600000	71426.950000	12.500000	0.098900	0.802000	0.099150	-0.061275	38.750000	2.900000	0.450000	0.400000	10.000000	4.300000	2.000000	2.500000	20
3002	2019-10-08	16906.181818	63532.500000	13.136364	0.062545	0.780727	0.156727	0.329145	30.045455	0.954545	0.590909	0.227273	10.363636	3.181818	1.227273	2.090909	22
3003	2019-10-09	15753.972222	60207.944444	10.666667	0.111917	0.772250	0.115944	-0.012267	28.222222	1.888889	0.194444	0.055556	8.472222	2.722222	1.111111	1.888889	36
3004	2019-10-10	18020.500000	71392.300000	11.350000	0.106400	0.812500	0.081050	-0.162165	29.150000	1.550000	0.150000	0.500000	8.650000	3.250000	1.150000	1.850000	20
3005	2019-10-11	12619.571429	50638.333333	13.714286	0.125143	0.634714	0.240095	0.320157	23.285714	1.380952	0.238095	0.095238	8.190476	2.142857	0.952381	2.095238	21
3006	2019-10-12	17773.500000	72197.722222	12.111111	0.107222	0.808944	0.083944	0.065067	24.333333	1.277778	0.222222	0.222222	9.055556	2.111111	0.555556	1.777778	18
3007	2019-10-13	17595.500000	70724.250000	15.300000	0.099700	0.812100	0.088200	-0.098710	37.250000	3.750000	0.200000	0.500000	10.550000	4.050000	1.450000	2.700000	20
3008	2019-10-14	22994.500000	90547.000000	0.000000	0.093000	0.792000	0.115000	0.154200	33.500000	0.500000	0.000000	0.500000	11.000000	2.500000	1.500000	0.500000	2

	Date	retweet_count	favorite_count	hour	neg	neu	pos	compound	length	num_upper	...	adverbs	adjectives	tweet_count	Adj Close	moving_avg	daily_change_close	daily_change_high	daily_change_low	daily_change_low_high
1026	2019-09-21	15857.117647	57961.176471	17.352941	0.099118	0.825824	0.075059	-0.138959	24.705882	0.470588	...	1.000000	1.705882	17.0	2979.190063	2966.250000	1.209759	2.037384	0.896864	1.120380
1027	2019-09-22	19649.125000	80410.687500	12.000000	0.077188	0.800250	0.122500	0.031713	22.187500	0.375000	...	1.437500	1.625000	16.0	2991.780029	2965.107143	1.714829	1.965391	1.390146	0.567117
1028	2019-09-23	19151.636364	80758.727273	12.727273	0.082727	0.814091	0.103091	0.025309	23.545455	0.454545	...	0.909091	1.363636	11.0	2966.600098	2967.392857	0.704690	2.109378	0.403583	1.673644
1029	2019-09-24	25912.066667	107750.800000	15.533333	0.069933	0.820933	0.109133	-0.066920	17.400000	0.666667	...	1.000000	1.333333	15.0	2984.870117	2968.406067	1.156541	1.324294	0.071729	1.245135
1030	2019-09-25	17074.304348	65088.260870	10.956522	0.045174	0.883957	0.070783	0.031443	19.217391	1.086957	...	0.652174	0.826087	23.0	2402.619995	2396.529641	1.171761	1.492007	0.732113	0.750315

	Date	retweet_count	favorite_count	hour	neg	neu	pos	compound	length	num_upper	...	adjectives	tweet_count	Adj Close	moving_avg	daily_change_close	daily_change_high	daily_change_low	daily_change_low_high
32	2016-12-06	21109.600000	75222.600000	17.600000	0.094400	0.884800	0.020800	-0.239360	18.000000	0.200000	...	0.400000	5.0	2259.530029	2233.809274	3.696176	3.708568	3.223478	0.469837
33	2016-12-07	9467.500000	43229.750000	13.000000	0.000000	0.871750	0.128250	0.323650	15.000000	1.000000	...	0.250000	4.0	2259.530029	2233.809274	3.696176	3.708568	3.223478	0.469837
34	2016-12-08	11400.000000	48998.800000	13.400000	0.057400	0.748200	0.194400	0.400440	20.600000	0.400000	...	1.400000	5.0	2259.530029	2233.809274	3.696176	3.708568	3.223478	0.469837
35	2016-12-09	7933.000000	33396.666667	14.666667	0.000000	0.956667	0.043333	0.152467	16.000000	1.000000	...	0.666667	3.0	2256.959961	2235.842494	3.332243	3.655938	3.122103	0.516193
36	2016-12-10	12998.571429	51251.285714	12.000000	0.065571	0.761857	0.172429	0.300100	19.714286	2.571429	...	1.000000	7.0	2264.339966	2236.907676	3.522775	3.817249	3.225960	0.572015

	Predictors	t-Statistic
16	res_length	8.491517
15	verbs	8.232335
14	nouns	7.579926
13	adjectives	7.252414
12	adverbs	6.410645
11	favorite_count	3.793288
10	retweet_count	3.061613
9	tweet_count	2.476251
8	num_upper	2.417248
7	is_financial	2.122675
6	country_mentioned	2.121410
5	hour	1.175000
4	compound	1.082760
3	neg	0.981493
2	no_tweet	0.471056
1	neu	0.072219
0	pos	0.046118

	Model	Acuracy Score	AUC	TPR	TNR
1	l2-Logistic	0.625	0.661	0.662	0.602
0	KNN	0.645	0.668	0.693	0.616
5	Neural Net	0.650	0.672	0.670	0.634
2	Decision Tree	0.655	0.680	0.667	0.645
4	AdaBoost	0.675	0.687	0.670	0.680
3	Random Forest	0.680	0.685	0.676	0.684

CSCI-109A Team #91, Fall 2019

CS109A Introduction to Data Science

COMPSCI 109A - Group Project Notebook¶

Submitted by: Ben Horev, Katrine Pertsovski, Parth Patel¶

Import libraries and data¶

Get Trump tweets data; convert dates to datetime and remove retweets¶

Import financial indicator data¶

Create response variables adjusted for time series trend¶

Plot response variables and change in response variables¶

Sentiment analysis of the tweets¶

Concatenate sentiment analysis with initial dataframe and convert dates column to regular dates¶

Create a list of financial keywords to determine if the tweets are financial¶

Create a list of financial keywords semantically similar to ['money', 'capital', 'financial_market', 'trade']¶

Convert financial list of words to roots of words¶

Create additional predictors¶

Function that counts nouns, verbs, adverbs and adjectives¶

Count nouns, verbs, and adverbs¶

Sentiment analysis figures¶

Distribution of negative and positive scores over time¶

Drop unnecessary tweets columns¶

Merge response variable and tweet related features¶

Perform outer mege of tweet features with financial indicator¶

Parth and Katrine to provide code for imputation of missting days stock market days¶

Create threshold variable for binary classification (Median)¶

Plot interactions among features and the response variable¶

Frequency of word catergorises were substracted from the length variable to reduce collinearity among variables. As the compound variable was collinear with neg and pos sentiments, this variable was not incorporated in to the model.¶

Plot interations among predictors¶

Split data and determine to train and test datasets and drop unnecessary columns for model generation¶

Calcualte t-statistic of each feature with regards to stock market volatility indicator¶

Fit Classification Models¶

KNN model¶

Fit logistic model with lasso regularizaton¶

Plot ROC curve and calcualte AUC for logistic model¶

Fit decision tree classifier¶

Model feature importance¶

Fit random forest classifier¶

Determination of optimal decision tree classifier depth¶

Fit the model¶

Model feature importance¶

Fit AdaBoost classifier¶

Model feature importance¶

Fit random forest regressor¶

Prediction of daily S&P500 daily fluctuation range¶

Normalize variables data¶

Neural Net classifier¶

Refit neural net model using 10 epoch and batch of 128¶

Neural Net Regressor¶

Create standardized features for neural net regressor using preprocessing.scale()¶

Summary of classification models metrics¶