diff --git a/Dataset modeling for financial time series data.md b/Dataset modeling for financial time series data.md
new file mode 100644
index 0000000..dfa8b0d
--- /dev/null
+++ b/Dataset modeling for financial time series data.md

# Dataset modeling for Financial Time Series Data
This document presents research on finding the best format for representing financial time series data, together with the supporting data analysis, for use with machine learning techniques.

## On the data provided - Overview


```python
%matplotlib inline

import pandas as pd
import pandas_datareader as web
from IPython.core.display import display
import matplotlib.pylab as plt
from stockstats import StockDataFrame
import seaborn as sns
sns.set()

# Daily USD/BRL quotes from Yahoo Finance, wrapped in a StockDataFrame
# so technical indicators can be derived from the OHLC columns
df = web.DataReader('BRL=X', 'yahoo')
data = pd.DataFrame(df)
data = StockDataFrame.retype(data)
display(data.head())
data.plot(figsize=(15,10))
```
| Date | open | high | low | close | adj close | volume |
|------|------|------|-----|-------|-----------|--------|
| 2010-01-04 | 1.6930 | 1.7412 | 1.6723 | 1.7190 | 1.7190 | 0.0 |
| 2010-01-05 | 1.6713 | 1.7370 | 1.6713 | 1.7370 | 1.7370 | 0.0 |
| 2010-01-06 | 1.6798 | 1.7359 | 1.6798 | 1.7315 | 1.7315 | 0.0 |
| 2010-01-07 | 1.7242 | 1.7472 | 1.6805 | 1.7389 | 1.7389 | 0.0 |
| 2010-01-08 | 1.6954 | 1.7492 | 1.6954 | 1.7320 | 1.7320 | 0.0 |
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_2_2.png)


## On the indicators


```python
%matplotlib inline

import pandas as pd
import pandas_datareader as web
from IPython.core.display import display
import matplotlib.pylab as plt
from stockstats import StockDataFrame
import seaborn as sns
sns.set()

# Precomputed dataset with the full set of technical indicators for USD/BRL
data = pd.read_csv('USDBRL/all_indicators.csv')
data = StockDataFrame.retype(data)
copy = data.copy()
display(data.tail())
```
| date | open | high | low | close | adj close | volume | close_20_sma | close_20_mstd | boll | boll_ub | ... | mdi_14 | mdi | dx_14 | dx | dx_6_ema | adx | adx_6_ema | adxr | trix | trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-22 | 3.1912 | 3.2063 | 3.1828 | 3.1947 | 3.1947 | 0.0 | 3.25131 | 0.045347 | 3.25131 | 3.342003 | ... | 32.424464 | 32.424464 | 50.393826 | 50.393826 | 44.705562 | 44.705562 | 46.145262 | 46.145262 | -0.104079 | -0.070007 |
| 2018-01-23 | 3.2007 | 3.2364 | 3.1986 | 3.2007 | 3.2007 | 0.0 | 3.24457 | 0.042074 | 3.24457 | 3.328719 | ... | 27.456171 | 27.456171 | 12.093108 | 12.093108 | 35.387718 | 35.387718 | 43.071678 | 43.071678 | -0.108291 | -0.079818 |
| 2018-01-24 | 3.2337 | 3.2382 | 3.1757 | 3.2355 | 3.2355 | 0.0 | 3.24086 | 0.039202 | 3.24086 | 3.319265 | ... | 31.174430 | 31.174430 | 28.154808 | 28.154808 | 33.321172 | 33.321172 | 40.285819 | 40.285819 | -0.107148 | -0.087835 |
| 2018-01-25 | 3.1451 | 3.1484 | 3.1215 | 3.1451 | 3.1451 | 0.0 | 3.23245 | 0.040851 | 3.23245 | 3.314153 | ... | 41.194580 | 41.194580 | 52.070509 | 52.070509 | 38.678126 | 38.678126 | 39.826478 | 39.826478 | -0.112533 | -0.094800 |
| 2018-01-26 | 3.1454 | 3.1543 | 3.1312 | 3.1469 | 3.1469 | 0.0 | 3.22424 | 0.040712 | 3.22424 | 3.305665 | ... | 36.821796 | 36.821796 | 45.967524 | 45.967524 | 40.760811 | 40.760811 | 40.093430 | 40.093430 | -0.120949 | -0.101018 |

*5 rows × 69 columns*
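As a side note on how such a file can be produced: stockstats derives each indicator lazily, so simply accessing a key materializes the column. A minimal sketch, assuming the same Yahoo Finance source as above (the output path is illustrative; the indicator names match the columns used throughout this document):

```python
from stockstats import StockDataFrame
import pandas_datareader as web

raw = web.DataReader('BRL=X', 'yahoo')
stock = StockDataFrame.retype(raw)

# accessing a key computes and caches the indicator column
for col in ['close_20_sma', 'boll', 'macd', 'adx', 'trix']:
    stock[col]

stock.to_csv('USDBRL/all_indicators.csv')  # illustrative path
```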
## Handling missing data (Data Cleaning)


```python
#How much of the data is missing
counter_nan = data.isnull().sum().sort_values(ascending=False)
plt.figure(figsize=(15,10))
plt.scatter(counter_nan, counter_nan.values)
plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_6_0.png)



```python
#How many columns do not have a single NaN
counter_without_nan = counter_nan[counter_nan==0]
print " [+] Number of columns that does not have a nan: " + str(len(counter_without_nan))
print " [+] Number of total columns: " + str(len(data.columns))
```

     [+] Number of columns that does not have a nan: 24
     [+] Number of total columns: 69


###### Most of the NaNs we encounter are caused by indicators that need a window of previous data points before producing their first value


```python
display(data[counter_nan.keys()].head())
```
| date | cci_20 | cci | tr | high_delta | um | low_delta | dm | close_-1_d | cr-ma3 | close_-1_s | ... | kdjk_9 | close_10_sma | macds | close_50_sma | dma | pdm | pdm_14_ema | pdm_14 | macdh | macd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010-01-04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 55.926463 | 1.719000 | 0.000000 | 1.719000 | 0.0 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2010-01-05 | 66.666667 | 66.666667 | 0.0657 | -0.0042 | 0.0000 | -0.0010 | 0.001 | 0.0180 | NaN | 1.7190 | ... | 68.614781 | 1.728000 | 0.000224 | 1.728000 | 0.0 | 0.0000 | 0.000000 | 0.000000 | 0.000359 | 0.000404 |
| 2010-01-06 | 60.363636 | 60.363636 | 0.0572 | -0.0011 | 0.0000 | 0.0085 | 0.000 | -0.0055 | NaN | 1.7370 | ... | 74.450865 | 1.729167 | 0.000273 | 1.729167 | 0.0 | 0.0000 | 0.000000 | 0.000000 | 0.000141 | 0.000344 |
| 2010-01-07 | 133.333333 | 133.333333 | 0.0667 | 0.0113 | 0.0113 | 0.0007 | 0.000 | 0.0074 | NaN | 1.7315 | ... | 79.322096 | 1.731600 | 0.000376 | 1.731600 | 0.0 | 0.0113 | 0.003457 | 0.003457 | 0.000400 | 0.000576 |
| 2010-01-08 | 106.533036 | 106.533036 | 0.0538 | 0.0020 | 0.0020 | 0.0149 | 0.000 | -0.0069 | NaN | 1.7389 | ... | 78.854868 | 1.731680 | 0.000387 | 1.731680 | 0.0 | 0.0020 | 0.003077 | 0.003077 | 0.000055 | 0.000415 |

*5 rows × 69 columns*
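A quick way to confirm that the leading NaNs come from indicator warm-up windows is to look at each column's first valid index; a minimal check, assuming the columns shown above:

```python
# each indicator's first valid index reveals how much leading history it
# consumes before producing its first value
for col in ['cci_20', 'cci', 'tr', 'cr-ma3']:
    print col, data[col].first_valid_index()
```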
###### Erasing columns that are constant or all zero


```python
import numpy as np

# Taking out columns whose values never change (all zero or all equal):
# if the sum of consecutive differences is zero, the column carries no signal
data = StockDataFrame.retype(data)
cols = data.select_dtypes([np.number]).columns
diff = data[cols].diff().sum()

data = data.drop(diff[diff==0].index, axis=1)
data = data.drop('adj close', 1)
display(data.tail())

```
| date | open | high | low | close | close_20_sma | close_20_mstd | boll | boll_ub | boll_lb | close_-1_s | ... | mdi_14 | mdi | dx_14 | dx | dx_6_ema | adx | adx_6_ema | adxr | trix | trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-22 | 3.1912 | 3.2063 | 3.1828 | 3.1947 | 3.25131 | 0.045347 | 3.25131 | 3.342003 | 3.160617 | 3.2051 | ... | 32.424464 | 32.424464 | 50.393826 | 50.393826 | 44.705562 | 44.705562 | 46.145262 | 46.145262 | -0.104079 | -0.070007 |
| 2018-01-23 | 3.2007 | 3.2364 | 3.1986 | 3.2007 | 3.24457 | 0.042074 | 3.24457 | 3.328719 | 3.160421 | 3.1947 | ... | 27.456171 | 27.456171 | 12.093108 | 12.093108 | 35.387718 | 35.387718 | 43.071678 | 43.071678 | -0.108291 | -0.079818 |
| 2018-01-24 | 3.2337 | 3.2382 | 3.1757 | 3.2355 | 3.24086 | 0.039202 | 3.24086 | 3.319265 | 3.162455 | 3.2007 | ... | 31.174430 | 31.174430 | 28.154808 | 28.154808 | 33.321172 | 33.321172 | 40.285819 | 40.285819 | -0.107148 | -0.087835 |
| 2018-01-25 | 3.1451 | 3.1484 | 3.1215 | 3.1451 | 3.23245 | 0.040851 | 3.23245 | 3.314153 | 3.150747 | 3.2355 | ... | 41.194580 | 41.194580 | 52.070509 | 52.070509 | 38.678126 | 38.678126 | 39.826478 | 39.826478 | -0.112533 | -0.094800 |
| 2018-01-26 | 3.1454 | 3.1543 | 3.1312 | 3.1469 | 3.22424 | 0.040712 | 3.22424 | 3.305665 | 3.142815 | 3.1451 | ... | 36.821796 | 36.821796 | 45.967524 | 45.967524 | 40.760811 | 40.760811 | 40.093430 | 40.093430 | -0.120949 | -0.101018 |

*5 rows × 66 columns*
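One caveat with this test, sketched on a toy frame: `diff().sum()` is zero for constant columns, but it would also flag a column whose ups and downs happen to cancel out exactly, so it is worth eyeballing what gets dropped:

```python
import pandas as pd

toy = pd.DataFrame({'flat': [1.0, 1.0, 1.0, 1.0],
                    'netted': [1.0, 2.0, 1.5, 1.0],
                    'moves': [1.0, 2.0, 3.0, 4.0]})
print toy.diff().sum()
# flat      0.0  -> dropped (truly constant)
# netted    0.0  -> dropped too, despite moving (round trip back to 1.0)
# moves     3.0  -> kept
```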
###### Slicing off the first and last rows of the index gives us a simple solution, with minimal data loss, to the indicators' need for previous data




```python
data = data[14:-14]
counter_nan = data.isnull().sum().sort_values(ascending=False)
display(data[counter_nan.keys()].head())
plt.figure(figsize=(15,10))
plt.scatter(counter_nan, counter_nan.values)
plt.show()
print " [+] Number of columns that does not have a nan: " + str(len(counter_nan))
print " [+] Number of total columns: " + str(len(data.columns))
```
| date | cci_20 | cci | low_delta | um | high_delta | tr | close_-1_d | dm | wr_6 | open | ... | mdm_14 | mdi_14 | trix | kdjj | kdjj_9 | kdjd | kdjd_9 | kdjk | kdjk_9 | trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010-01-22 | 178.996300 | 176.807799 | 0.0130 | 0.0269 | 0.0269 | 0.0776 | 0.0180 | 0.0000 | 9.292763 | 1.7525 | ... | 0.001941 | 3.085113 | 0.143416 | 97.528377 | 97.528377 | 92.856768 | 92.856768 | 94.413971 | 94.413971 | 0.083942 |
| 2010-01-25 | 128.966672 | 124.296506 | 0.0130 | 0.0000 | -0.0088 | 0.0558 | -0.0353 | 0.0000 | 38.800999 | 1.8189 | ... | 0.001653 | 2.659399 | 0.155344 | 73.827148 | 73.827148 | 90.138251 | 90.138251 | 84.701217 | 84.701217 | 0.096051 |
| 2010-01-26 | 197.350586 | 184.521032 | 0.0474 | 0.0247 | 0.0247 | 0.0625 | 0.0501 | 0.0000 | 9.117647 | 1.8136 | ... | 0.001411 | 2.269388 | 0.172968 | 82.362163 | 82.362163 | 89.027382 | 89.027382 | 86.805642 | 86.805642 | 0.110112 |
| 2010-01-27 | 170.239369 | 148.954115 | -0.0269 | 0.0203 | 0.0203 | 0.0803 | 0.0160 | 0.0269 | 11.533149 | 1.7860 | ... | 0.005090 | 7.953166 | 0.195355 | 85.874366 | 85.874366 | 88.576951 | 88.576951 | 87.676089 | 87.676089 | 0.125540 |
| 2010-01-28 | 166.319888 | 142.587103 | 0.0204 | 0.0049 | 0.0049 | 0.0648 | 0.0184 | 0.0000 | 2.429765 | 1.8064 | ... | 0.004363 | 6.809581 | 0.222101 | 94.516229 | 94.516229 | 89.425419 | 89.425419 | 91.122356 | 91.122356 | 0.142624 |

*5 rows × 66 columns*
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_13_1.png)


     [+] Number of columns that does not have a nan: 66
     [+] Number of total columns: 66


###### After slicing we can backfill the NaN values left by holidays and other exceptional days on the market


```python
#Back filling for holidays and exceptional days on the market
data = data.fillna(method='bfill')
data = data[1:-1]
counter_without_nan = data.isnull().sum().sort_values(ascending=False)
print " [+] Number of columns that does not have a nan: " + str(len(counter_without_nan))
print " [+] Number of total columns: " + str(len(data.columns))
```

     [+] Number of columns that does not have a nan: 66
     [+] Number of total columns: 66


## Data Exploration


```python
def plot_histogram(x):
    plt.figure(figsize=(15,10))
    plt.hist(x, alpha=0.5)
    plt.title("Histogram of '{var_name}'".format(var_name=x.name))
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()
```


```python
plot_histogram(data['macdh'])
plot_histogram(data['cci'])
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_18_0.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_18_1.png)


###### Exploring the distribution of the percentage change in the close value


```python
import matplotlib.mlab as mlab

mu = data['close_-1_r'].mean()
sigma = data['close_-1_r'].std()
x = data['close_-1_r']
num_bins = 50
fig, ax = plt.subplots(figsize=(15,10))
# note: `normed` and mlab.normpdf are deprecated in newer matplotlib
# (density=True and scipy.stats.norm.pdf are the modern equivalents)
n, bins, patches = ax.hist(x, num_bins, normed=1)
y = mlab.normpdf(bins, mu, sigma)
ax.plot(bins, y, '--')
ax.set_title('Histogram of 1-day Change $\mu=' + str(mu) + '$, $\sigma=' + str(sigma) + '$')
plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_20_0.png)


###### Making our first label, a 1-day-ahead forecast target, for feature exploration


```python
label_display = pd.DataFrame()
label_display['close'] = data['close']
label_display['from_yesterday_rate'] = data['close_-1_r']
# shift(-1) looks at tomorrow's return: label 1 if the close goes up, 0 otherwise
y1 = data['close_-1_r'].shift(-1)
y1 = y1.apply(lambda x:1 if x>0.0000 else 0)
label_display['y'] = y1
display(label_display.head(7))

```

###### Exploring the influence of a feature on the outcome target


```python
def plot_histogram_dv(x,y):
    plt.figure(figsize=(15,10))
    plt.hist(list(x[y==0]), alpha=0.5, label='Bear')
    plt.hist(list(x[y==1]), alpha=0.5, label='Bull')
    plt.title("Histogram of '{var_name}' by Forecast Target".format(var_name=x.name))
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.legend(loc='upper right')
    plt.show()
```


```python
plot_histogram_dv(data['macdh'], y1)
plot_histogram_dv(data['cci'], y1)
plot_histogram_dv(data['adx'], y1)
plot_histogram_dv(data['kdjk'], y1)
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_0.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_1.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_2.png)
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_3.png)


## Feature Engineering

###### Normalizing and Standardizing distributions
Different techniques to represent a price movement can be tried, so that we can select the one with the best results


```python
data.plot(x=data.index, y=['close_20_sma','adx', 'cci'], figsize=(15, 10))

```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_28_1.png)


#### As shown above, different indicators live on different scales, so we need to normalize them in various ways and search for the best results

###### First let's explore the behavior of each target label


```python
#Labeling the different window frames
##Signaling the difference between a feature datapoint and the previous/next one

def labelwf(dataframe, wf):
    for i in wf:
        swf = str(i)
        dataframe['label' + swf] = \
            (dataframe['close'] - dataframe['close'].shift(i))/dataframe['close'].shift(i)
        dataframe['label' + swf] = dataframe['label' + swf].apply(lambda x:1 if x>0.0 else 0)
    return dataframe

#Negative for looking at future datapoints
#Positive for looking backwards
window_frames = [-1, -2, -15, 1, 2, 15]
labeled_data = labelwf(data.copy(), window_frames)
index = list(range(len(data)))
index = index[-250:-15]
label1 = labeled_data['label-1'].values
label1 = label1[-250:-15]
label15 = labeled_data['label-15'].values
label15 = label15[-250:-15]
c1 = copy['close_1_r'].apply(lambda x:0 if x>0.000 else 1)
c15 = copy['close_15_r'].apply(lambda x:0 if x>0.000 else 1)
y_5 = copy['close_5_r'].apply(lambda x:0 if x>0.000 else 1)
y_10 = copy['close_10_r'].apply(lambda x:0 if x>0.000 else 1)
y_30 = copy['close_30_r'].apply(lambda x:0 if x>0.000 else 1)
index = list(range(len(c1)))
index = index[-250:-15]

fig, ax = plt.subplots(figsize=(15, 8), sharey=True)
ax.plot(index, c1[-250:-15], label='1d forward', color='r')
ax.scatter(index, c15[-250:-15], label='15d forward', color='g')
ax.legend()


labeled_data['index'] = list(range(len(data)))
data.plot(y='close', figsize=(15, 8))
for r in labeled_data.iterrows():
    if r[1]['label1'] == 1:
        plt.axvline(x=r[1]['index'], linewidth=0.3, alpha=0.3, color='g')
    else:
        plt.axvline(x=r[1]['index'], linewidth=0.3, alpha=0.3, color='r')

plt.show()

```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_31_0.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_31_1.png)


###### Percentage change of each indicator: (x_n - x_{n-y}) / x_{n-y}, evaluated at n, n+y, n+2y, ..., where y is the selected time-frame window


```python
#Convert each feature datapoint to its percentage change
#over the selected window frame
def percent_change(dataframe, wf):
    new = pd.DataFrame()
    swf = str(wf)
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        else:
            dataframe['change_' + str(dataframe[feature].name)] = \
                (dataframe[feature] - dataframe[feature].shift(wf))/dataframe[feature].shift(wf)
            new['change_' + str(dataframe[feature].name)] = \
                (dataframe[feature] - dataframe[feature].shift(wf))/dataframe[feature].shift(wf)
    return dataframe, new

raw_data = data.copy()
data, percent_change_data = percent_change(data, 1)
data = data.drop('change_pdm', 1)
data = data.drop('change_um', 1)
data = data.drop('change_dm', 1)
percent_change_data = percent_change_data.drop('change_pdm', 1)
percent_change_data = percent_change_data.drop('change_um', 1)
percent_change_data = percent_change_data.drop('change_dm', 1)
percent_change_data = percent_change_data.replace([np.inf, -np.inf], np.nan)
percent_change_data = percent_change_data.fillna(method='bfill')
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
data.plot(x=data.index, y='change_close_20_sma', figsize=(15,10))
data.plot(x=data.index, y=['change_kdjk','change_adx', 'change_close_20_sma'], figsize=(15,10))

display(data.tail())
display(percent_change_data.tail())
plot_histogram_dv(data['change_macdh'], y1)
plot_histogram_dv(data['change_macdh'], c15)
```
| date | open | high | low | close | close_20_sma | close_20_mstd | boll | boll_ub | boll_lb | close_-1_s | ... | change_mdi_14 | change_mdi | change_dx_14 | change_dx | change_dx_6_ema | change_adx | change_adx_6_ema | change_adxr | change_trix | change_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | 3.3075 | 3.3117 | 3.3075 | 3.3076 | 3.296855 | 0.029485 | 3.296855 | 3.355825 | 3.237885 | 3.3111 | ... | -0.073114 | -0.073114 | -6.225712e-16 | -6.225712e-16 | 0.047143 | 0.047143 | -0.009660 | -0.009660 | -0.034542 | 0.016791 |
| 2018-01-02 | 3.3108 | 3.3127 | 3.2585 | 3.3110 | 3.300275 | 0.026696 | 3.300275 | 3.353666 | 3.246884 | 3.3076 | ... | 0.695512 | 0.695512 | 1.310292e+00 | 1.310292e+00 | 0.448662 | 0.448662 | 0.118096 | 0.118096 | -0.044511 | 0.007815 |
| 2018-01-03 | 3.2574 | 3.2638 | 3.2410 | 3.2578 | 3.301150 | 0.024849 | 3.301150 | 3.350849 | 3.251451 | 3.3110 | ... | -0.015868 | -0.015868 | 1.234280e-01 | 1.234280e-01 | 0.283790 | 0.283790 | 0.177938 | 0.177938 | -0.126147 | -0.007375 |
| 2018-01-04 | 3.2356 | 3.2410 | 3.2214 | 3.2355 | 3.301210 | 0.024680 | 3.301210 | 3.350571 | 3.251849 | 3.2578 | ... | 0.066333 | 0.066333 | 1.039332e-01 | 1.039332e-01 | 0.204003 | 0.204003 | 0.188197 | 0.188197 | -0.228872 | -0.030493 |
| 2018-01-05 | 3.2328 | 3.2479 | 3.2256 | 3.2331 | 3.298505 | 0.028901 | 3.298505 | 3.356306 | 3.240704 | 3.2355 | ... | -0.105324 | -0.105324 | -1.462284e-01 | -1.462284e-01 | 0.061550 | 0.061550 | 0.137684 | 0.137684 | -0.352545 | -0.063682 |

*5 rows × 129 columns*
| date | change_open | change_high | change_low | change_close | change_close_20_sma | change_close_20_mstd | change_boll | change_boll_ub | change_boll_lb | change_close_-1_s | ... | change_mdi_14 | change_mdi | change_dx_14 | change_dx | change_dx_6_ema | change_adx | change_adx_6_ema | change_adxr | change_trix | change_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | -0.001057 | -0.000151 | 0.007770 | -0.001057 | 0.000762 | -0.037582 | 0.000762 | 0.000062 | 0.001489 | -0.000664 | ... | -0.073114 | -0.073114 | -6.225712e-16 | -6.225712e-16 | 0.047143 | 0.047143 | -0.009660 | -0.009660 | -0.034542 | 0.016791 |
| 2018-01-02 | 0.000998 | 0.000302 | -0.014815 | 0.001028 | 0.001037 | -0.094602 | 0.001037 | -0.000643 | 0.002779 | -0.001057 | ... | 0.695512 | 0.695512 | 1.310292e+00 | 1.310292e+00 | 0.448662 | 0.448662 | 0.118096 | 0.118096 | -0.044511 | 0.007815 |
| 2018-01-03 | -0.016129 | -0.014761 | -0.005371 | -0.016068 | 0.000265 | -0.069161 | 0.000265 | -0.000840 | 0.001407 | 0.001028 | ... | -0.015868 | -0.015868 | 1.234280e-01 | 1.234280e-01 | 0.283790 | 0.283790 | 0.177938 | 0.177938 | -0.126147 | -0.007375 |
| 2018-01-04 | -0.006692 | -0.006986 | -0.006048 | -0.006845 | 0.000018 | -0.006802 | 0.000018 | -0.000083 | 0.000122 | -0.016068 | ... | 0.066333 | 0.066333 | 1.039332e-01 | 1.039332e-01 | 0.204003 | 0.204003 | 0.188197 | 0.188197 | -0.228872 | -0.030493 |
| 2018-01-05 | -0.000865 | 0.002129 | 0.001304 | -0.000742 | -0.000819 | 0.170995 | -0.000819 | 0.001712 | -0.003427 | -0.006845 | ... | -0.105324 | -0.105324 | -1.462284e-01 | -1.462284e-01 | 0.061550 | 0.061550 | 0.137684 | 0.137684 | -0.352545 | -0.063682 |

*5 rows × 63 columns*
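As a quick sanity check on the transform (a minimal sketch using the `raw_data` copy taken above): the hand-rolled change formula is the same quantity pandas computes with the built-in `pct_change`:

```python
import numpy as np

# (x_n - x_{n-1}) / x_{n-1}, written by hand and via pandas
manual = (raw_data['close'] - raw_data['close'].shift(1)) / raw_data['close'].shift(1)
builtin = raw_data['close'].pct_change()
print np.allclose(manual.dropna(), builtin.dropna())  # True
```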
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_33_2.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_33_3.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_33_4.png)


#### We see in the pictures above that even with the percent-change ratio we can't tell how significant a given change was: the magnitudes still differ by orders of magnitude across features

###### Standardized change range: ((x_n - x_{n-1}) / x_{n-1}) / (change_max - change_min), i.e. the change scaled by the full range of that feature's change series


```python
#How abnormal was the change compared to the feature's full change range
def normalized_range(dataframe, wf):
    swf = str(wf)
    new = pd.DataFrame()
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        elif 'rchange_' in str(dataframe[feature].name):
            pass
        else:
            try:
                range = dataframe['change_' + str(dataframe[feature].name)].max() - \
                    dataframe['change_' + str(dataframe[feature].name)].min()
                dataframe['rchange_' + str(dataframe[feature].name)] = \
                    dataframe['change_' + str(dataframe[feature].name)] / range
                new['rchange_' + str(dataframe[feature].name)] = \
                    dataframe['change_' + str(dataframe[feature].name)] / range
            except:
                pass
    return dataframe, new


change_data = data.copy()
data, normalized_range_data = normalized_range(data, 1)
data.plot(x=data.index, y=['rchange_close_20_sma','rchange_adx', 'rchange_close'], figsize=(15,10))
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
normalized_range_data = normalized_range_data.replace([np.inf, -np.inf], np.nan)
normalized_range_data = normalized_range_data.fillna(method='bfill')


display(data.tail())
display(normalized_range_data.tail())
plot_histogram_dv(normalized_range_data['rchange_rsi_6'], y1)
plot_histogram_dv(normalized_range_data['rchange_rsi_6'], c15)
```
| date | open | high | low | close | close_20_sma | close_20_mstd | boll | boll_ub | boll_lb | close_-1_s | ... | rchange_mdi_14 | rchange_mdi | rchange_dx_14 | rchange_dx | rchange_dx_6_ema | rchange_adx | rchange_adx_6_ema | rchange_adxr | rchange_trix | rchange_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | 3.3075 | 3.3117 | 3.3075 | 3.3076 | 3.296855 | 0.029485 | 3.296855 | 3.355825 | 3.237885 | 3.3111 | ... | -0.010609 | -0.010609 | -2.072037e-19 | -2.072037e-19 | 0.012496 | 0.012496 | -0.015652 | -0.015652 | -0.000204 | 0.000091 |
| 2018-01-02 | 3.3108 | 3.3127 | 3.2585 | 3.3110 | 3.300275 | 0.026696 | 3.300275 | 3.353666 | 3.246884 | 3.3076 | ... | 0.100917 | 0.100917 | 4.360903e-04 | 4.360903e-04 | 0.118926 | 0.118926 | 0.191346 | 0.191346 | -0.000263 | 0.000042 |
| 2018-01-03 | 3.2574 | 3.2638 | 3.2410 | 3.2578 | 3.301150 | 0.024849 | 3.301150 | 3.350849 | 3.251451 | 3.3110 | ... | -0.002302 | -0.002302 | 4.107921e-05 | 4.107921e-05 | 0.075224 | 0.075224 | 0.288305 | 0.288305 | -0.000745 | -0.000040 |
| 2018-01-04 | 3.2356 | 3.2410 | 3.2214 | 3.2355 | 3.301210 | 0.024680 | 3.301210 | 3.350571 | 3.251849 | 3.2578 | ... | 0.009625 | 0.009625 | 3.459096e-05 | 3.459096e-05 | 0.054075 | 0.054075 | 0.304928 | 0.304928 | -0.001352 | -0.000165 |
| 2018-01-05 | 3.2328 | 3.2479 | 3.2256 | 3.2331 | 3.298505 | 0.028901 | 3.298505 | 3.356306 | 3.240704 | 3.2355 | ... | -0.015282 | -0.015282 | -4.866762e-05 | -4.866762e-05 | 0.016315 | 0.016315 | 0.223084 | 0.223084 | -0.002083 | -0.000345 |

*5 rows × 192 columns*
| date | rchange_open | rchange_high | rchange_low | rchange_close | rchange_close_20_sma | rchange_close_20_mstd | rchange_boll | rchange_boll_ub | rchange_boll_lb | rchange_close_-1_s | ... | rchange_mdi_14 | rchange_mdi | rchange_dx_14 | rchange_dx | rchange_dx_6_ema | rchange_adx | rchange_adx_6_ema | rchange_adxr | rchange_trix | rchange_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | -0.006705 | -0.000915 | 0.065792 | -0.007716 | 0.055369 | -0.022755 | 0.055369 | 0.001321 | 0.041792 | -0.004847 | ... | -0.010609 | -0.010609 | -2.072037e-19 | -2.072037e-19 | 0.012496 | 0.012496 | -0.015652 | -0.015652 | -0.000204 | 0.000091 |
| 2018-01-02 | 0.006329 | 0.001830 | -0.125450 | 0.007503 | 0.075386 | -0.057278 | 0.075386 | -0.013763 | 0.078024 | -0.007716 | ... | 0.100917 | 0.100917 | 4.360903e-04 | 4.360903e-04 | 0.118926 | 0.118926 | 0.191346 | 0.191346 | -0.000263 | 0.000042 |
| 2018-01-03 | -0.102309 | -0.089450 | -0.045477 | -0.117284 | 0.019267 | -0.041874 | 0.019267 | -0.017975 | 0.039494 | 0.007503 | ... | -0.002302 | -0.002302 | 4.107921e-05 | 4.107921e-05 | 0.075224 | 0.075224 | 0.288305 | 0.288305 | -0.000745 | -0.000040 |
| 2018-01-04 | -0.042451 | -0.042332 | -0.051209 | -0.049965 | 0.001321 | -0.004119 | 0.001321 | -0.001775 | 0.003437 | -0.117284 | ... | 0.009625 | 0.009625 | 3.459096e-05 | 3.459096e-05 | 0.054075 | 0.054075 | 0.304928 | 0.304928 | -0.001352 | -0.000165 |
| 2018-01-05 | -0.005489 | 0.012901 | 0.011040 | -0.005414 | -0.059547 | 0.103531 | -0.059547 | 0.036623 | -0.096222 | -0.049965 | ... | -0.015282 | -0.015282 | -4.866762e-05 | -4.866762e-05 | 0.016315 | 0.016315 | 0.223084 | 0.223084 | -0.002083 | -0.000345 |

*5 rows × 63 columns*
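A toy reading of the new scale (numbers are illustrative, not from the dataset): if a feature's change series historically spans [-0.08, 0.12], then a +0.01 daily change sits at 5% of that feature's full range, a figure that is directly comparable across features:

```python
change = 0.01
change_range = 0.12 - (-0.08)
print change / change_range  # 0.05
```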
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_36_2.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_36_3.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_36_4.png)


#### As we can see, the datapoints now express their movements much more intuitively, on a shared axis of change

###### Normalized change rate: (Δ_n - mean(Δ)) / std(Δ), where Δ_n = (x_n - x_{n-1}) / x_{n-1} — that is, (Change - Mean) / Standard Deviation, the z-score of the change series


```python
#How abnormal was this change percentage ratio in comparison to the others
def normalized_change(dataframe, wf):
    swf = str(wf)
    new = pd.DataFrame()
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        elif 'rchange_' in str(dataframe[feature].name):
            pass
        elif 'nchange_' in str(dataframe[feature].name):
            pass
        else:
            try:
                std = dataframe['change_' + str(dataframe[feature].name)].std()
                mean = dataframe['change_' + str(dataframe[feature].name)].mean()
                dataframe['nchange_' + str(dataframe[feature].name)] = \
                    (dataframe['change_' + str(dataframe[feature].name)] - mean)/std
                new['nchange_' + str(dataframe[feature].name)] = \
                    (dataframe['change_' + str(dataframe[feature].name)] - mean)/std
            except:
                pass

    return dataframe, new

rchange_data = data.copy()
data, normalized_change_data = normalized_change(data, 1)
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
normalized_change_data = normalized_change_data.replace([np.inf, -np.inf], np.nan)
normalized_change_data = normalized_change_data.fillna(method='bfill')
data.plot(x=data.index, y=['nchange_close_20_sma','nchange_adx', 'nchange_close'], figsize=(15, 10))

display(data.tail())
display(normalized_change_data.tail())

plot_histogram_dv(normalized_change_data['nchange_rsi_6'], y1)
plot_histogram_dv(normalized_change_data['nchange_rsi_6'], c15)

```
| date | open | high | low | close | close_20_sma | close_20_mstd | boll | boll_ub | boll_lb | close_-1_s | ... | nchange_mdi_14 | nchange_mdi | nchange_dx_14 | nchange_dx | nchange_dx_6_ema | nchange_adx | nchange_adx_6_ema | nchange_adxr | nchange_trix | nchange_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | 3.3075 | 3.3117 | 3.3075 | 3.3076 | 3.296855 | 0.029485 | 3.296855 | 3.355825 | 3.237885 | 3.3111 | ... | -0.277020 | -0.277020 | -0.045644 | -0.045644 | 0.134691 | 0.134691 | -0.148862 | -0.148862 | 0.007774 | 0.020925 |
| 2018-01-02 | 3.3108 | 3.3127 | 3.2585 | 3.3110 | 3.300275 | 0.026696 | 3.300275 | 3.353666 | 3.246884 | 3.3076 | ... | 1.612389 | 1.612389 | -0.026412 | -0.026412 | 2.033934 | 2.033934 | 1.249729 | 1.249729 | 0.004729 | 0.018454 |
| 2018-01-03 | 3.2574 | 3.2638 | 3.2410 | 3.2578 | 3.301150 | 0.024849 | 3.301150 | 3.350849 | 3.251451 | 3.3110 | ... | -0.136300 | -0.136300 | -0.043832 | -0.043832 | 1.254066 | 1.254066 | 1.904840 | 1.904840 | -0.020209 | 0.014271 |
| 2018-01-04 | 3.2356 | 3.2410 | 3.2214 | 3.2355 | 3.301210 | 0.024680 | 3.301210 | 3.350571 | 3.251849 | 3.2578 | ... | 0.065763 | 0.065763 | -0.044118 | -0.044118 | 0.876665 | 0.876665 | 2.017157 | 2.017157 | -0.051589 | 0.007907 |
| 2018-01-05 | 3.2328 | 3.2479 | 3.2256 | 3.2331 | 3.298505 | 0.028901 | 3.298505 | 3.356306 | 3.240704 | 3.2355 | ... | -0.356199 | -0.356199 | -0.047790 | -0.047790 | 0.202839 | 0.202839 | 1.464168 | 1.464168 | -0.089369 | -0.001231 |

*5 rows × 255 columns*
| date | nchange_open | nchange_high | nchange_low | nchange_close | nchange_close_20_sma | nchange_close_20_mstd | nchange_boll | nchange_boll_ub | nchange_boll_lb | nchange_close_-1_s | ... | nchange_mdi_14 | nchange_mdi | nchange_dx_14 | nchange_dx | nchange_dx_6_ema | nchange_adx | nchange_adx_6_ema | nchange_adxr | nchange_trix | nchange_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | -0.128347 | -0.050057 | 0.821339 | -0.127259 | 0.229905 | -0.441805 | 0.229905 | -0.071889 | 0.362274 | -0.089153 | ... | -0.277020 | -0.277020 | -0.045644 | -0.045644 | 0.134691 | 0.134691 | -0.148862 | -0.148862 | 0.007774 | 0.020925 |
| 2018-01-02 | 0.061175 | -0.003072 | -1.676715 | 0.059701 | 0.368936 | -1.041422 | 0.368936 | -0.279976 | 0.760588 | -0.124398 | ... | 1.612389 | 1.612389 | -0.026412 | -0.026412 | 2.033934 | 2.033934 | 1.249729 | 1.249729 | 0.004729 | 0.018454 |
| 2018-01-03 | -1.518482 | -1.565728 | -0.632093 | -1.473256 | -0.020851 | -0.773884 | -0.020851 | -0.338086 | 0.337015 | 0.062559 | ... | -0.136300 | -0.136300 | -0.043832 | -0.043832 | 1.254066 | 1.254066 | 1.904840 | 1.904840 | -0.020209 | 0.014271 |
| 2018-01-04 | -0.648116 | -0.759089 | -0.706970 | -0.646273 | -0.145504 | -0.118126 | -0.145504 | -0.114609 | -0.059371 | -1.470369 | ... | 0.065763 | 0.065763 | -0.044118 | -0.044118 | 0.876665 | 0.876665 | 2.017157 | 2.017157 | -0.051589 | 0.007907 |
| 2018-01-05 | -0.110665 | 0.186461 | 0.106153 | -0.098988 | -0.568277 | 1.751577 | -0.568277 | 0.415113 | -1.154962 | -0.643402 | ... | -0.356199 | -0.356199 | -0.047790 | -0.047790 | 0.202839 | 0.202839 | 1.464168 | 1.464168 | -0.089369 | -0.001231 |

*5 rows × 63 columns*
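Since `nchange` is a plain z-score, a short sketch of how it can be read (column name taken from the frame above): under an approximately normal change distribution, |z| > 2 flags roughly the most extreme ~5% of daily moves:

```python
z = normalized_change_data['nchange_close']
print (z.abs() > 2).mean()  # fraction of days with a >2-sigma close move
```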
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_39_2.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_39_3.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_39_4.png)


###### And now we can evaluate how anomalous a certain datapoint is without losing the information carried by the feature

###### Normalizing the raw features instead of the change rate


```python
#How abnormal is the position where the datapoint is located
#We substitute the original feature value for this one
def distance(dataframe):
    new = pd.DataFrame()
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        elif 'nchange_' in str(dataframe[feature].name):
            pass
        elif 'rchange_' in str(dataframe[feature].name):
            pass
        elif 'distance_' in str(dataframe[feature].name):
            pass
        else:
            std = dataframe[feature].std()
            mean = dataframe[feature].mean()
            dataframe['distance_' + str(dataframe[feature].name)] = (dataframe[feature] - mean)/std
            new['distance_' + str(dataframe[feature].name)] = (dataframe[feature] - mean)/std
    return dataframe, new

nchange = data.copy()
data, distance_data = distance(data)
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
distance_data = distance_data.replace([np.inf, -np.inf], np.nan)
distance_data = distance_data.fillna(method='bfill')
data.plot(x=data.index, y=['distance_close_20_sma','distance_adx', 'close_20_sma'], figsize=(15,10))


display(data.tail())
display(distance_data.tail())

plot_histogram_dv(distance_data['distance_macdh'], y1)
plot_histogram_dv(data['macdh'], y1)
plot_histogram_dv(distance_data['distance_macdh'], c15)
plot_histogram_dv(data['macdh'], c15)
```
| date | open | high | low | close | close_20_sma | close_20_mstd | boll | boll_ub | boll_lb | close_-1_s | ... | distance_mdi_14 | distance_mdi | distance_dx_14 | distance_dx | distance_dx_6_ema | distance_adx | distance_adx_6_ema | distance_adxr | distance_trix | distance_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | 3.3075 | 3.3117 | 3.3075 | 3.3076 | 3.296855 | 0.029485 | 3.296855 | 3.355825 | 3.237885 | 3.3111 | ... | 0.488966 | 0.488966 | -0.348806 | -0.348806 | -0.616822 | -0.616822 | -0.665492 | -0.665492 | 0.338169 | 0.335080 |
| 2018-01-02 | 3.3108 | 3.3127 | 3.2585 | 3.3110 | 3.300275 | 0.026696 | 3.300275 | 3.353666 | 3.246884 | 3.3076 | ... | 2.302947 | 2.302947 | 1.064375 | 1.064375 | -0.031028 | -0.031028 | -0.485673 | -0.485673 | 0.314458 | 0.339260 |
| 2018-01-03 | 3.2574 | 3.2638 | 3.2410 | 3.2578 | 3.301150 | 0.024849 | 3.301150 | 3.350849 | 3.251451 | 3.3110 | ... | 2.232778 | 2.232778 | 1.371921 | 1.371921 | 0.505743 | 0.505743 | -0.182739 | -0.182739 | 0.250250 | 0.335284 |
| 2018-01-04 | 3.2356 | 3.2410 | 3.2214 | 3.2355 | 3.301210 | 0.024680 | 3.301210 | 3.350571 | 3.251849 | 3.2578 | ... | 2.521454 | 2.521454 | 1.662856 | 1.662856 | 1.001106 | 1.001106 | 0.194673 | 0.194673 | 0.148451 | 0.318967 |
| 2018-01-05 | 3.2328 | 3.2479 | 3.2256 | 3.2331 | 3.298505 | 0.028901 | 3.298505 | 3.356306 | 3.240704 | 3.2355 | ... | 2.032685 | 2.032685 | 1.210983 | 1.210983 | 1.181051 | 1.181051 | 0.522749 | 0.522749 | 0.027533 | 0.285929 |

*5 rows × 321 columns*
| date | distance_open | distance_high | distance_low | distance_close | distance_close_20_sma | distance_close_20_mstd | distance_boll | distance_boll_ub | distance_boll_lb | distance_close_-1_s | ... | distance_mdi_14 | distance_mdi | distance_dx_14 | distance_dx | distance_dx_6_ema | distance_adx | distance_adx_6_ema | distance_adxr | distance_trix | distance_trix_9_sma |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-01 | 1.129346 | 1.105120 | 1.161948 | 1.130977 | 1.128932 | -0.326446 | 1.128932 | 1.056927 | 1.203622 | 1.136831 | ... | 0.488966 | 0.488966 | -0.348806 | -0.348806 | -0.616822 | -0.616822 | -0.665492 | -0.665492 | 0.338169 | 0.335080 |
| 2018-01-02 | 1.133888 | 1.106495 | 1.093890 | 1.135696 | 1.133691 | -0.433323 | 1.133691 | 1.054054 | 1.216700 | 1.131972 | ... | 2.302947 | 2.302947 | 1.064375 | 1.064375 | -0.031028 | -0.031028 | -0.485673 | -0.485673 | 0.314458 | 0.339260 |
| 2018-01-03 | 1.060385 | 1.039224 | 1.069584 | 1.061854 | 1.134909 | -0.504066 | 1.134909 | 1.050304 | 1.223338 | 1.136692 | ... | 2.232778 | 2.232778 | 1.371921 | 1.371921 | 0.505743 | 0.505743 | -0.182739 | -0.182739 | 0.250250 | 0.335284 |
| 2018-01-04 | 1.030377 | 1.007858 | 1.042361 | 1.030902 | 1.134993 | -0.510543 | 1.134993 | 1.049934 | 1.223916 | 1.062847 | ... | 2.521454 | 2.521454 | 1.662856 | 1.662856 | 1.001106 | 1.001106 | 0.194673 | 0.194673 | 0.148451 | 0.318967 |
| 2018-01-05 | 1.026523 | 1.017350 | 1.048194 | 1.027570 | 1.131228 | -0.348840 | 1.131228 | 1.057567 | 1.207719 | 1.031893 | ... | 2.032685 | 2.032685 | 1.210983 | 1.210983 | 1.181051 | 1.181051 | 0.522749 | 0.522749 | 0.027533 | 0.285929 |

*5 rows × 66 columns*
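One caveat worth flagging here (not addressed in this notebook): `distance` standardizes each feature against the mean and standard deviation of the whole sample, so early datapoints are scored with statistics that include their own future. A hedged sketch of a leakage-free variant that scores each point against trailing history only (the window size is an arbitrary choice):

```python
# score each point against the trailing year only (~252 trading days)
window = 252
rolling_mean = data['close'].rolling(window=window).mean()
rolling_std = data['close'].rolling(window=window).std()
rolling_distance = (data['close'] - rolling_mean) / rolling_std
```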
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_2.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_3.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_4.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_5.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_6.png)


#### And evaluate how far each datapoint sits from the rest of its feature's distribution

###### Creating new features via interactions between the existing ones


```python
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    # Get feature names: the originals plus one name per pairwise combination
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]

    # Find interactions: pairwise products of all features
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames

    # Remove interaction terms whose values are all 0
    noint_indices = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indices], axis=1)

    return df
```


```python
teste = add_interactions(data.copy())
print (teste.head(5))
```

         open    high     low   close  close_20_sma  close_20_mstd      boll  \
    0  1.8189  1.8213  1.7655  1.7835      1.758200       0.028923  1.758200
    1  1.8136  1.8460  1.8129  1.8336      1.762635       0.033447  1.762635
    2  1.7860  1.8663  1.7860  1.8496      1.767467       0.038380  1.767467
    3  1.8064  1.8712  1.8064  1.8680      1.772758       0.043854  1.772758
    4  1.8200  1.8729  1.8200  1.8729      1.777765       0.048201  1.777765

        boll_ub   boll_lb  close_-1_s ...                                   \
    0  1.816046  1.700354      1.8188 ...
    1  1.829528  1.695742      1.7835 ...
    2  1.844227  1.690707      1.8336 ...
    3  1.860465  1.685051      1.8496 ...
    4  1.874167  1.681363      1.8680 ...

       distance_adx_distance_adx_6_ema  distance_adx_distance_adxr  \
    0                         2.155962                     2.155962
    1                         3.204561                     3.204561
    2                         2.238586                     2.238586
    3                         1.551822                     1.551822
    4                         1.090493                     1.090493

       distance_adx_distance_trix  distance_adx_distance_trix_9_sma  \
    0                    1.237139                          0.663483
    1                    1.774381                          1.013457
    2                    1.445436                          0.852129
    3                    1.260181                          0.754362
    4                    1.133128                          0.694455

       distance_adx_6_ema_distance_adxr  distance_adx_6_ema_distance_trix  \
    0                          1.881588                          1.079697
    1                          2.633112                          1.457967
    2                          2.594067                          1.674967
    3                          2.216411                          1.799871
    4                          1.767651                          1.836761

       distance_adx_6_ema_distance_trix_9_sma  distance_adxr_distance_trix  \
    0                                 0.579046                     1.079697
    1                                 0.832734                     1.457967
    2                                 0.987445                     1.674967
    3                                 1.077429                     1.799871
    4                                 1.125687                     1.836761

       distance_adxr_distance_trix_9_sma  distance_trix_distance_trix_9_sma
    0                           0.579046                           0.332270
    1                           0.832734                           0.461089
    2                           0.987445                           0.637585
    3                           1.077429                           0.874942
    4                           1.125687                           1.169698

    [5 rows x 51681 columns]


## Feature Selection

###### Methods based on the F-test estimate the degree of linear dependency between two random variables. Mutual-information methods, on the other hand, can capture any kind of statistical dependency, but, being nonparametric, they require more samples for accurate estimation.
```python
import numpy as np
from sklearn.feature_selection import f_regression, mutual_info_regression

y_15 = c15[15:-15]
y_1 = c1[15:-15]
y_5 = y_5[15:-15]
y_30 = y_30[15:-15]
mi = mutual_info_regression(distance_data, y_15, discrete_features='auto')
mi /= np.max(mi)
result = distance_data.columns[mi > 0.1]
miresult = result
mi = mi[mi > 0.1]
print len(result)
display(result)
mi_df = pd.DataFrame(index=result, columns=['value'])
mi_df['value'] = mi
mi_df.plot(figsize=(15,10))
display(mi_df.head())
print mi_df

print "\n"

# f_regression returns (F-scores, p-values); this notebook filters on the
# p-values, keeping the columns whose p-value exceeds 0.1
ftest, pvalues = f_regression(distance_data, y_15)
ftest /= np.max(ftest)
pvalues[np.isnan(pvalues)] = 0.0
f = pvalues
result = distance_data.columns[f > 0.1]
f = f[f > 0.1]
print len(result)
print result

f_df = pd.DataFrame(index=result, columns=['value'])
f_df['value'] = f
f_df.plot(figsize=(15,10))
display(f_df.head())
print f_df

equal = []

for i in miresult.values:
    if i in result.values:
        equal.append(i)

print "\n"
display(equal)


```

    29



    Index([u'distance_open', u'distance_high', u'distance_low', u'distance_close',
           u'distance_close_20_sma', u'distance_close_20_mstd', u'distance_boll',
           u'distance_boll_ub', u'distance_boll_lb', u'distance_close_-1_s',
           u'distance_close_26_ema', u'distance_macd', u'distance_middle',
           u'distance_cr-ma1', u'distance_cr-ma3', u'distance_open_2_sma',
           u'distance_middle_14_sma', u'distance_middle_20_sma', u'distance_atr',
           u'distance_close_10_sma', u'distance_close_50_sma', u'distance_dma',
           u'distance_atr_14', u'distance_dx_14', u'distance_dx',
           u'distance_dx_6_ema', u'distance_adx', u'distance_trix',
           u'distance_trix_9_sma'],
          dtype='object')
| | value |
|---|---|
| distance_open | 0.440642 |
| distance_high | 0.443556 |
| distance_low | 0.505598 |
| distance_close | 0.468534 |
| distance_close_20_sma | 0.491667 |
+ + + value + distance_open 0.440642 + distance_high 0.443556 + distance_low 0.505598 + distance_close 0.468534 + distance_close_20_sma 0.491667 + distance_close_20_mstd 0.217032 + distance_boll 0.494343 + distance_boll_ub 0.829823 + distance_boll_lb 0.555011 + distance_close_-1_s 0.442161 + distance_close_26_ema 0.729244 + distance_macd 0.168234 + distance_middle 0.637619 + distance_cr-ma1 0.207764 + distance_cr-ma3 0.198476 + distance_open_2_sma 0.450697 + distance_middle_14_sma 0.642620 + distance_middle_20_sma 0.506292 + distance_atr 0.241409 + distance_close_10_sma 0.624836 + distance_close_50_sma 1.000000 + distance_dma 0.172680 + distance_atr_14 0.246042 + distance_dx_14 0.185833 + distance_dx 0.173521 + distance_dx_6_ema 0.113376 + distance_adx 0.113376 + distance_trix 0.319277 + distance_trix_9_sma 0.260197 + + + 24 + Index([u'distance_open', u'distance_high', u'distance_low', u'distance_close', + u'distance_boll_lb', u'distance_close_-1_s', u'distance_close_-1_d', + u'distance_close_-1_r', u'distance_middle', u'distance_cr-ma3', + u'distance_rsv_9', u'distance_kdjk_9', u'distance_kdjk', + u'distance_kdjj_9', u'distance_kdjj', u'distance_open_2_sma', + u'distance_wr_10', u'distance_middle_14_sma', u'distance_close_10_sma', + u'distance_pdm_14_ema', u'distance_pdm_14', u'distance_adx_6_ema', + u'distance_adxr', u'distance_trix_9_sma'], + dtype='object') + + + +
| | value |
|---|---|
| distance_open | 0.191533 |
| distance_high | 0.181462 |
| distance_low | 0.210108 |
| distance_close | 0.138125 |
| distance_boll_lb | 0.141074 |
                               value
    distance_open           0.191533
    distance_high           0.181462
    distance_low            0.210108
    distance_close          0.138125
    distance_boll_lb        0.141074
    distance_close_-1_s     0.141206
    distance_close_-1_d     0.740016
    distance_close_-1_r     0.530851
    distance_middle         0.174595
    distance_cr-ma3         0.211435
    distance_rsv_9          0.249812
    distance_kdjk_9         0.276445
    distance_kdjk           0.276445
    distance_kdjj_9         0.714550
    distance_kdjj           0.714550
    distance_open_2_sma     0.184072
    distance_wr_10          0.488122
    distance_middle_14_sma  0.110842
    distance_close_10_sma   0.116276
    distance_pdm_14_ema     0.299721
    distance_pdm_14         0.299721
    distance_adx_6_ema      0.506360
    distance_adxr           0.506360
    distance_trix_9_sma     0.250674




    ['distance_open',
     'distance_high',
     'distance_low',
     'distance_close',
     'distance_boll_lb',
     'distance_close_-1_s',
     'distance_middle',
     'distance_cr-ma3',
     'distance_open_2_sma',
     'distance_middle_14_sma',
     'distance_close_10_sma',
     'distance_trix_9_sma']



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_49_7.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_49_8.png)



```python
from sklearn.decomposition import PCA

# project the distance features onto their two main components
pca = PCA(n_components=2)
data_pca = pd.DataFrame(pca.fit_transform(distance_data))
#display(data_pca.head())
data_pca.plot(figsize=(15,10))

datatest = pca.fit_transform(distance_data)
plt.figure(num=None, figsize=(18, 11), dpi=80, facecolor='w', edgecolor='k')
plt.scatter(datatest[:, 0], datatest[:, 1])
plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_50_0.png)



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_50_1.png)


###### t-Distributed Stochastic Neighbor Embedding
t-SNE turns pairwise distances into a similarity matrix, letting us compare how similar each datapoint is to the rest


```python
# t-distributed Stochastic Neighbor Embedding (t-SNE) visualization
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
x_test_2d = tsne.fit_transform(distance_data)

# combine the 15-day and 1-day targets into four classes for coloring
y_tsne = []
for key, i in np.ndenumerate(y_15):
    if i == 0:
        if y_1[key[0]] == 0:
            y_tsne.append(0)
        elif y_1[key[0]] == 1:
            y_tsne.append(1)
    if i == 1:
        if y_1[key[0]] == 0:
            y_tsne.append(2)
        elif y_1[key[0]] == 1:
            y_tsne.append(3)

y_test = np.array(y_tsne)


markers=('s', 'd', 'o', '^', 'v')
color_map = {0:'red', 1:'blue', 2:'lightgreen', 3:'purple'}
plt.figure(figsize=(15,10))
for idx, cl in enumerate(np.unique(y_test)):
    plt.scatter(x=x_test_2d[y_test==cl,0], y=x_test_2d[y_test==cl,1], c=color_map[idx], marker=markers[idx], label=cl, alpha=0.5)
plt.xlabel('X in t-SNE')
plt.ylabel('Y in t-SNE')
plt.legend(loc='upper left')
plt.title('t-SNE visualization of test data')
plt.show()


```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_52_0.png)


# Facebook Time Series Forecasting

###### Prophet library


```python
from fbprophet import Prophet
import numpy as np

test = data.copy()
test['ds'] = data.index
# fit Prophet on the log of the close price
test['y'] = np.log(data['close'])
display(test.tail())
m = Prophet()
m.fit(test)
# forecast one year ahead
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
m.plot(forecast)
m.plot_components(forecast)
```
| Date | open | high | low | close | adj close | volume | ds | y |
|---|---|---|---|---|---|---|---|---|
| 2018-02-05 | 3.2169 | 3.2455 | 3.2138 | 3.2154 | 3.2154 | 0.0 | 2018-02-05 | 1.167952 |
| 2018-02-06 | 3.2611 | 3.2759 | 3.2175 | 3.2611 | 3.2611 | 0.0 | 2018-02-06 | 1.182065 |
| 2018-02-07 | 3.2333 | 3.2630 | 3.2314 | 3.2334 | 3.2334 | 0.0 | 2018-02-07 | 1.173534 |
| 2018-02-08 | 3.2696 | 3.2926 | 3.2562 | 3.2699 | 3.2699 | 0.0 | 2018-02-08 | 1.184759 |
| 2018-02-09 | 3.2844 | 3.3075 | 3.2708 | 3.2846 | 3.2846 | 0.0 | 2018-02-09 | 1.189245 |
    INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
    /Library/Python/2.7/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
      elif np.issubdtype(np.asarray(v).dtype, float):




![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_55_2.png)




![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_55_3.png)




![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_55_4.png)
financial time series data_55_2.png b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_2.png new file mode 100644 index 0000000..dd3511e Binary files /dev/null and b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_2.png differ diff --git a/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_3.png b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_3.png new file mode 100644 index 0000000..b1b10ed Binary files /dev/null and b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_3.png differ diff --git a/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_4.png b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_4.png new file mode 100644 index 0000000..dd3511e Binary files /dev/null and b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_55_4.png differ diff --git a/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_6_0.png b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_6_0.png new file mode 100644 index 0000000..6b23edd Binary files /dev/null and b/Dataset modeling for financial time series data_files/Dataset modeling for financial time series data_6_0.png differ diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 8170ef9..dfa8b0d --- a/README.md +++ b/README.md @@ -1,5 +1,3545 @@ -# Kamaji -#### Documentation provided in IPython Notebook - Algar Kamaji Documentation - [Data prep, cleaning and tests results](https://github.com/sudoFerraz/Kamaji/blob/master/Dataset%20modeling%20for%20financial%20time%20series%20data.ipynb) - - #### This is a full fledged application with a Flask Backend serving the model results + +# Dataset modeling for Financial Time Series Data +This document aims to provide information on the research related to find the best format to represent financial time series data with certain data analysis for the usage of machine learning techniques + +## On the data provided - Overview + + +```python +%matplotlib inline + +import pandas as pd +import pandas_datareader as web +from IPython.core.display import display +import matplotlib.pylab as plt +from stockstats import StockDataFrame +import seaborn as sns +sns.set() + +df = web.DataReader('BRL=X', 'yahoo') +data = pd.DataFrame(df) +data = StockDataFrame.retype(data) +display(data.head()) +data.plot(figsize=(15,10)) +``` + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseadj closevolume
Date
2010-01-041.69301.74121.67231.71901.71900.0
2010-01-051.67131.73701.67131.73701.73700.0
2010-01-061.67981.73591.67981.73151.73150.0
2010-01-071.72421.74721.68051.73891.73890.0
2010-01-081.69541.74921.69541.73201.73200.0
+
+ + + + + + + + + + +![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_2_2.png) + + +## On the indicators + + +```python +%matplotlib inline + +import pandas as pd +import pandas_datareader as web +from IPython.core.display import display +import matplotlib.pylab as plt +from stockstats import StockDataFrame +import seaborn as sns +sns.set() + +data = pd.read_csv('USDBRL/all_indicators.csv') +data = StockDataFrame.retype(data) +copy = data.copy() +display(data.tail()) +``` + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseadj closevolumeclose_20_smaclose_20_mstdbollboll_ub...mdi_14mdidx_14dxdx_6_emaadxadx_6_emaadxrtrixtrix_9_sma
date
2018-01-223.19123.20633.18283.19473.19470.03.251310.0453473.251313.342003...32.42446432.42446450.39382650.39382644.70556244.70556246.14526246.145262-0.104079-0.070007
2018-01-233.20073.23643.19863.20073.20070.03.244570.0420743.244573.328719...27.45617127.45617112.09310812.09310835.38771835.38771843.07167843.071678-0.108291-0.079818
2018-01-243.23373.23823.17573.23553.23550.03.240860.0392023.240863.319265...31.17443031.17443028.15480828.15480833.32117233.32117240.28581940.285819-0.107148-0.087835
2018-01-253.14513.14843.12153.14513.14510.03.232450.0408513.232453.314153...41.19458041.19458052.07050952.07050938.67812638.67812639.82647839.826478-0.112533-0.094800
2018-01-263.14543.15433.13123.14693.14690.03.224240.0407123.224243.305665...36.82179636.82179645.96752445.96752440.76081140.76081140.09343040.093430-0.120949-0.101018
+

5 rows × 69 columns

+
+ + +## Handling missing data (Data Cleaning) + + +```python +#How much of the data is missing +counter_nan = data.isnull().sum().sort_values(ascending=False) +plt.figure(figsize=(15,10)) +plt.scatter(counter_nan, counter_nan.values) +plt.show() +``` + + +![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_6_0.png) + + + +```python +#how many columns does not have a single nan +counter_without_nan = counter_nan[counter_nan==0] +print " [+] Number of columns that does not have a nan: " + str(len(counter_without_nan)) +print " [+] Number of total columns: " + str(len(data.columns)) +``` + + [+] Number of columns that does not have a nan: 24 + [+] Number of total columns: 69 + + +###### Much of the encountered NaN are caused from the indicators necessity for previous data + + +```python +display(data[counter_nan.keys()].head()) +``` + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
cci_20ccitrhigh_deltaumlow_deltadmclose_-1_dcr-ma3close_-1_s...kdjk_9close_10_smamacdsclose_50_smadmapdmpdm_14_emapdm_14macdhmacd
date
2010-01-04NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...55.9264631.7190000.0000001.7190000.00.00000.0000000.0000000.0000000.000000
2010-01-0566.66666766.6666670.0657-0.00420.0000-0.00100.0010.0180NaN1.7190...68.6147811.7280000.0002241.7280000.00.00000.0000000.0000000.0003590.000404
2010-01-0660.36363660.3636360.0572-0.00110.00000.00850.000-0.0055NaN1.7370...74.4508651.7291670.0002731.7291670.00.00000.0000000.0000000.0001410.000344
2010-01-07133.333333133.3333330.06670.01130.01130.00070.0000.0074NaN1.7315...79.3220961.7316000.0003761.7316000.00.01130.0034570.0034570.0004000.000576
2010-01-08106.533036106.5330360.05380.00200.00200.01490.000-0.0069NaN1.7389...78.8548681.7316800.0003871.7316800.00.00200.0030770.0030770.0000550.000415
+

5 rows × 69 columns

+
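Since each indicator needs a fixed warm-up window, the NaNs should be concentrated at the start of the series. As a quick illustrative check (not part of the original run), `first_valid_index()` can show how deep the leading-NaN run goes per column:

```python
# first_valid_index() gives the first date where a column has a real value,
# so late dates here reveal which indicators need the longest warm-up
warmup = {col: data[col].first_valid_index() for col in data.columns}
print(sorted(warmup.items(), key=lambda kv: str(kv[1]), reverse=True)[:5])
```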
###### Dropping columns that are constant or all zero


```python
import numpy as np

# Columns whose first differences sum to zero never change in practice,
# so they carry no information (all zeros or one repeated value)
data = StockDataFrame.retype(data)
cols = data.select_dtypes([np.number]).columns
diff = data[cols].diff().sum()

data = data.drop(diff[diff == 0].index, axis=1)
data = data.drop('adj close', axis=1)
display(data.tail())
```


<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseclose_20_smaclose_20_mstdbollboll_ubboll_lbclose_-1_s...mdi_14mdidx_14dxdx_6_emaadxadx_6_emaadxrtrixtrix_9_sma
date
2018-01-223.19123.20633.18283.19473.251310.0453473.251313.3420033.1606173.2051...32.42446432.42446450.39382650.39382644.70556244.70556246.14526246.145262-0.104079-0.070007
2018-01-233.20073.23643.19863.20073.244570.0420743.244573.3287193.1604213.1947...27.45617127.45617112.09310812.09310835.38771835.38771843.07167843.071678-0.108291-0.079818
2018-01-243.23373.23823.17573.23553.240860.0392023.240863.3192653.1624553.2007...31.17443031.17443028.15480828.15480833.32117233.32117240.28581940.285819-0.107148-0.087835
2018-01-253.14513.14843.12153.14513.232450.0408513.232453.3141533.1507473.2355...41.19458041.19458052.07050952.07050938.67812638.67812639.82647839.826478-0.112533-0.094800
2018-01-263.14543.15433.13123.14693.224240.0407123.224243.3056653.1428153.1451...36.82179636.82179645.96752445.96752440.76081140.76081140.09343040.093430-0.120949-0.101018
+

5 rows × 66 columns

+
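An equivalent, perhaps more direct, way to find those constant columns is `nunique`, which counts distinct values per column; a small sketch (run before the drop it lists the same columns, run after it should come back empty):

```python
# A column with at most one distinct value carries no signal
constant_cols = data.columns[data.nunique() <= 1]
print(" [+] Constant columns: " + str(list(constant_cols)))
```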
###### Slicing away the first and last rows is a simple fix with minimal data loss, since the indicators are only missing the previous datapoints they need


```python
data = data[14:-14]
counter_nan = data.isnull().sum().sort_values(ascending=False)
display(data[counter_nan.keys()].head())
plt.figure(figsize=(15,10))
plt.scatter(counter_nan, counter_nan.values)
plt.show()
print(" [+] Number of columns without a NaN: " + str(len(counter_nan)))
print(" [+] Number of total columns: " + str(len(data.columns)))
```


<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
cci_20ccilow_deltaumhigh_deltatrclose_-1_ddmwr_6open...mdm_14mdi_14trixkdjjkdjj_9kdjdkdjd_9kdjkkdjk_9trix_9_sma
date
2010-01-22178.996300176.8077990.01300.02690.02690.07760.01800.00009.2927631.7525...0.0019413.0851130.14341697.52837797.52837792.85676892.85676894.41397194.4139710.083942
2010-01-25128.966672124.2965060.01300.0000-0.00880.0558-0.03530.000038.8009991.8189...0.0016532.6593990.15534473.82714873.82714890.13825190.13825184.70121784.7012170.096051
2010-01-26197.350586184.5210320.04740.02470.02470.06250.05010.00009.1176471.8136...0.0014112.2693880.17296882.36216382.36216389.02738289.02738286.80564286.8056420.110112
2010-01-27170.239369148.954115-0.02690.02030.02030.08030.01600.026911.5331491.7860...0.0050907.9531660.19535585.87436685.87436688.57695188.57695187.67608987.6760890.125540
2010-01-28166.319888142.5871030.02040.00490.00490.06480.01840.00002.4297651.8064...0.0043636.8095810.22210194.51622994.51622989.42541989.42541991.12235691.1223560.142624
+

5 rows × 66 columns

+
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_13_1.png)


     [+] Number of columns without a NaN: 66
     [+] Number of total columns: 66


###### After slicing, we can backfill the NaN values left by holidays and other days the market did not trade


```python
# Backfill for holidays and exceptional days on the market
data = data.fillna(method='bfill')
data = data[1:-1]
counter_without_nan = data.isnull().sum().sort_values(ascending=False)
print(" [+] Number of columns without a NaN: " + str(len(counter_without_nan)))
print(" [+] Number of total columns: " + str(len(data.columns)))
```

     [+] Number of columns without a NaN: 66
     [+] Number of total columns: 66


## Data Exploring


```python
def plot_histogram(x):
    plt.figure(figsize=(15,10))
    plt.hist(x, alpha=0.5)
    plt.title("Histogram of '{var_name}'".format(var_name=x.name))
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()
```


```python
plot_histogram(data['macdh'])
plot_histogram(data['cci'])
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_18_0.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_18_1.png)


###### Exploring the distribution of the percentage change in the close value


```python
import matplotlib.mlab as mlab

mu = data['close_-1_r'].mean()
sigma = data['close_-1_r'].std()
x = data['close_-1_r']
num_bins = 50
fig, ax = plt.subplots(figsize=(15,10))
n, bins, patches = ax.hist(x, num_bins, normed=1)  # 'normed' became 'density' in newer matplotlib
y = mlab.normpdf(bins, mu, sigma)  # scipy.stats.norm.pdf in newer versions
ax.plot(bins, y, '--')
ax.set_title(r'Histogram of 1-day Change $\mu=' + str(mu) + r'$, $\sigma=' + str(sigma) + '$')
plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_20_0.png)


###### Creating our first label, a 1-day-ahead forecast target, for feature exploration


```python
label_display = pd.DataFrame()
label_display['close'] = data['close']
label_display['from_yesterday_rate'] = data['close_-1_r']
y1 = data['close_-1_r'].shift(-1)
y1 = y1.apply(lambda x: 1 if x > 0.0000 else 0)
label_display['y'] = y1
# c1 flags whether the next day closed down; it is defined here from the
# untouched copy of the indicators (it was previously referenced before assignment)
c1 = copy['close_1_r'].apply(lambda x: 0 if x > 0.000 else 1)
label_display['c1'] = c1
display(label_display.head(7))
```

###### Exploring the influence of each feature on the outcome target


```python
def plot_histogram_dv(x, y):
    plt.figure(figsize=(15,10))
    plt.hist(list(x[y==0]), alpha=0.5, label='Bear')
    plt.hist(list(x[y==1]), alpha=0.5, label='Bull')
    plt.title("Histogram of '{var_name}' by Forecast Target".format(var_name=x.name))
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.legend(loc='upper right')
    plt.show()
```


```python
plot_histogram_dv(data['macdh'], y1)
plot_histogram_dv(data['cci'], y1)
plot_histogram_dv(data['adx'], y1)
plot_histogram_dv(data['kdjk'], y1)
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_0.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_1.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_2.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_25_3.png)
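The overlaid histograms give a visual sense of how well each feature separates bull from bear days. To put a number on that separation, a two-sample Kolmogorov–Smirnov test could be run per feature; a minimal sketch, assuming `scipy` is available (this check is illustrative and not part of the original analysis):

```python
from scipy.stats import ks_2samp

# Compare each feature's distribution on bear days (y1 == 0) vs bull days (y1 == 1);
# a small p-value suggests the two distributions genuinely differ
for feature in ['macdh', 'cci', 'adx', 'kdjk']:
    stat, pvalue = ks_2samp(data[feature][y1 == 0], data[feature][y1 == 1])
    print(" [+] %s: KS statistic %.4f, p-value %.4f" % (feature, stat, pvalue))
```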
## Feature Engineering

###### Normalizing and Standardizing distributions
Different techniques for representing a price movement can be tried so we can select the one with the best results


```python
data.plot(x=data.index, y=['close_20_sma','adx', 'cci'], figsize=(15, 10))
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_28_1.png)


#### As shown above, different indicators live on different scales, so we need to normalize them in various ways and search for the best results

###### First, let's explore the behavior of each target label


```python
#Labeling the different window frames
##Signaling the difference between a feature datapoint and the previous/next one

def labelwf(dataframe, wf):
    for i in wf:
        swf = str(i)
        dataframe['label' + swf] = \
            (dataframe['close'] - dataframe['close'].shift(i))/dataframe['close'].shift(i)
        dataframe['label' + swf] = dataframe['label' + swf].apply(lambda x: 1 if x > 0.0 else 0)
    return dataframe

#Negative for looking at future datapoints
#Positive for looking backwards
window_frames = [-1, -2, -15, 1, 2, 15]
labeled_data = labelwf(data.copy(), window_frames)
index = list(range(len(data)))
index = index[-250:-15]
label1 = labeled_data['label-1'].values
label1 = label1[-250:-15]
label15 = labeled_data['label-15'].values
label15 = label15[-250:-15]
c1 = copy['close_1_r'].apply(lambda x: 0 if x > 0.000 else 1)
c15 = copy['close_15_r'].apply(lambda x: 0 if x > 0.000 else 1)
y_5 = copy['close_5_r'].apply(lambda x: 0 if x > 0.000 else 1)
y_10 = copy['close_10_r'].apply(lambda x: 0 if x > 0.000 else 1)
y_30 = copy['close_30_r'].apply(lambda x: 0 if x > 0.000 else 1)  # fixed typo: was .applu
index = list(range(len(c1)))
index = index[-250:-15]

fig, ax = plt.subplots(figsize=(15, 8), sharey=True)
ax.plot(index, c1[-250:-15], label='1d forward', color='r')
ax.scatter(index, c15[-250:-15], label='15d forward', color='g')
ax.legend()


labeled_data['index'] = list(range(len(data)))
data.plot(y='close', figsize=(15, 8))
for r in labeled_data.iterrows():
    if r[1]['label1'] == 1:
        plt.axvline(x=r[1]['index'], linewidth=0.3, alpha=0.3, color='g')
    else:
        plt.axvline(x=r[1]['index'], linewidth=0.3, alpha=0.3, color='r')

plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_31_0.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_31_1.png)


###### Percentage change of each indicator: (x_n - x_(n-w)) / x_(n-w), where w is the selected time frame window


```python
#Normalizing the feature datapoints
#according to their window frame

#Each datapoint becomes the change percentage over the timeframe
def percent_change(dataframe, wf):
    new = pd.DataFrame()
    swf = str(wf)
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        else:
            dataframe['change_' + str(dataframe[feature].name)] = \
                (dataframe[feature] - dataframe[feature].shift(wf))/dataframe[feature].shift(wf)
            new['change_' + str(dataframe[feature].name)] = \
                (dataframe[feature] - dataframe[feature].shift(wf))/dataframe[feature].shift(wf)
    return dataframe, new

raw_data = data.copy()
data, percent_change_data = percent_change(data, 1)
data = data.drop('change_pdm', axis=1)
data = data.drop('change_um', axis=1)
data = data.drop('change_dm', axis=1)
percent_change_data = percent_change_data.drop('change_pdm', axis=1)
percent_change_data = percent_change_data.drop('change_um', axis=1)
percent_change_data = percent_change_data.drop('change_dm', axis=1)
percent_change_data = percent_change_data.replace([np.inf, -np.inf], np.nan)
percent_change_data = percent_change_data.fillna(method='bfill')
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
data.plot(x=data.index, y='change_close_20_sma', figsize=(15,10))
data.plot(x=data.index, y=['change_kdjk','change_adx', 'change_close_20_sma'], figsize=(15,10))

display(data.tail())
display(percent_change_data.tail())
plot_histogram_dv(data['change_macdh'], y1)
plot_histogram_dv(data['change_macdh'], c15)
```
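For reference, pandas ships this transformation as `DataFrame.pct_change`; a minimal sketch of the equivalence, using a few columns from this dataset (the tables below remain the outputs of the cell above):

```python
# pct_change(periods=1) computes (x_n - x_(n-1)) / x_(n-1) column-wise,
# matching the hand-rolled percent_change above
alt_change = raw_data[['close', 'adx', 'kdjk']].pct_change(periods=1)
alt_change = alt_change.replace([np.inf, -np.inf], np.nan).fillna(method='bfill')
print(alt_change.tail())
```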
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseclose_20_smaclose_20_mstdbollboll_ubboll_lbclose_-1_s...change_mdi_14change_mdichange_dx_14change_dxchange_dx_6_emachange_adxchange_adx_6_emachange_adxrchange_trixchange_trix_9_sma
date
2018-01-013.30753.31173.30753.30763.2968550.0294853.2968553.3558253.2378853.3111...-0.073114-0.073114-6.225712e-16-6.225712e-160.0471430.047143-0.009660-0.009660-0.0345420.016791
2018-01-023.31083.31273.25853.31103.3002750.0266963.3002753.3536663.2468843.3076...0.6955120.6955121.310292e+001.310292e+000.4486620.4486620.1180960.118096-0.0445110.007815
2018-01-033.25743.26383.24103.25783.3011500.0248493.3011503.3508493.2514513.3110...-0.015868-0.0158681.234280e-011.234280e-010.2837900.2837900.1779380.177938-0.126147-0.007375
2018-01-043.23563.24103.22143.23553.3012100.0246803.3012103.3505713.2518493.2578...0.0663330.0663331.039332e-011.039332e-010.2040030.2040030.1881970.188197-0.228872-0.030493
2018-01-053.23283.24793.22563.23313.2985050.0289013.2985053.3563063.2407043.2355...-0.105324-0.105324-1.462284e-01-1.462284e-010.0615500.0615500.1376840.137684-0.352545-0.063682
+

5 rows × 129 columns

+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
change_openchange_highchange_lowchange_closechange_close_20_smachange_close_20_mstdchange_bollchange_boll_ubchange_boll_lbchange_close_-1_s...change_mdi_14change_mdichange_dx_14change_dxchange_dx_6_emachange_adxchange_adx_6_emachange_adxrchange_trixchange_trix_9_sma
date
2018-01-01-0.001057-0.0001510.007770-0.0010570.000762-0.0375820.0007620.0000620.001489-0.000664...-0.073114-0.073114-6.225712e-16-6.225712e-160.0471430.047143-0.009660-0.009660-0.0345420.016791
2018-01-020.0009980.000302-0.0148150.0010280.001037-0.0946020.001037-0.0006430.002779-0.001057...0.6955120.6955121.310292e+001.310292e+000.4486620.4486620.1180960.118096-0.0445110.007815
2018-01-03-0.016129-0.014761-0.005371-0.0160680.000265-0.0691610.000265-0.0008400.0014070.001028...-0.015868-0.0158681.234280e-011.234280e-010.2837900.2837900.1779380.177938-0.126147-0.007375
2018-01-04-0.006692-0.006986-0.006048-0.0068450.000018-0.0068020.000018-0.0000830.000122-0.016068...0.0663330.0663331.039332e-011.039332e-010.2040030.2040030.1881970.188197-0.228872-0.030493
2018-01-05-0.0008650.0021290.001304-0.000742-0.0008190.170995-0.0008190.001712-0.003427-0.006845...-0.105324-0.105324-1.462284e-01-1.462284e-010.0615500.0615500.1376840.137684-0.352545-0.063682
+

5 rows × 63 columns

+
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_33_2.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_33_3.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_33_4.png)


#### We see in the picture above that even with the percent change ratio we can't differentiate how significant a change was: across features the changes still differ by orders of magnitude

###### Standardized change range: ((x_n - x_(n-1)) / x_(n-1)) / (x_max - x_min)


```python
#How abnormal was the change compared to the feature's full range
def normalized_range(dataframe, wf):
    swf = str(wf)
    new = pd.DataFrame()
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        elif 'rchange_' in str(dataframe[feature].name):
            pass
        else:
            try:
                # avoid shadowing the builtin 'range'
                value_range = dataframe['change_' + str(dataframe[feature].name)].max() - \
                    dataframe['change_' + str(dataframe[feature].name)].min()
                dataframe['rchange_' + str(dataframe[feature].name)] = \
                    dataframe['change_' + str(dataframe[feature].name)] / value_range
                new['rchange_' + str(dataframe[feature].name)] = \
                    dataframe['change_' + str(dataframe[feature].name)] / value_range
            except:
                # columns without a 'change_' counterpart are skipped
                pass
    return dataframe, new


change_data = data.copy()
data, normalized_range_data = normalized_range(data, 1)
data.plot(x=data.index, y=['rchange_close_20_sma','rchange_adx', 'rchange_close'], figsize=(15,10))
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
normalized_range_data = normalized_range_data.replace([np.inf, -np.inf], np.nan)
normalized_range_data = normalized_range_data.fillna(method='bfill')


display(data.tail())
display(normalized_range_data.tail())
plot_histogram_dv(normalized_range_data['rchange_rsi_6'], y1)
plot_histogram_dv(normalized_range_data['rchange_rsi_6'], c15)
```


<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseclose_20_smaclose_20_mstdbollboll_ubboll_lbclose_-1_s...rchange_mdi_14rchange_mdirchange_dx_14rchange_dxrchange_dx_6_emarchange_adxrchange_adx_6_emarchange_adxrrchange_trixrchange_trix_9_sma
date
2018-01-013.30753.31173.30753.30763.2968550.0294853.2968553.3558253.2378853.3111...-0.010609-0.010609-2.072037e-19-2.072037e-190.0124960.012496-0.015652-0.015652-0.0002040.000091
2018-01-023.31083.31273.25853.31103.3002750.0266963.3002753.3536663.2468843.3076...0.1009170.1009174.360903e-044.360903e-040.1189260.1189260.1913460.191346-0.0002630.000042
2018-01-033.25743.26383.24103.25783.3011500.0248493.3011503.3508493.2514513.3110...-0.002302-0.0023024.107921e-054.107921e-050.0752240.0752240.2883050.288305-0.000745-0.000040
2018-01-043.23563.24103.22143.23553.3012100.0246803.3012103.3505713.2518493.2578...0.0096250.0096253.459096e-053.459096e-050.0540750.0540750.3049280.304928-0.001352-0.000165
2018-01-053.23283.24793.22563.23313.2985050.0289013.2985053.3563063.2407043.2355...-0.015282-0.015282-4.866762e-05-4.866762e-050.0163150.0163150.2230840.223084-0.002083-0.000345
+

5 rows × 192 columns

+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
rchange_openrchange_highrchange_lowrchange_closerchange_close_20_smarchange_close_20_mstdrchange_bollrchange_boll_ubrchange_boll_lbrchange_close_-1_s...rchange_mdi_14rchange_mdirchange_dx_14rchange_dxrchange_dx_6_emarchange_adxrchange_adx_6_emarchange_adxrrchange_trixrchange_trix_9_sma
date
2018-01-01-0.006705-0.0009150.065792-0.0077160.055369-0.0227550.0553690.0013210.041792-0.004847...-0.010609-0.010609-2.072037e-19-2.072037e-190.0124960.012496-0.015652-0.015652-0.0002040.000091
2018-01-020.0063290.001830-0.1254500.0075030.075386-0.0572780.075386-0.0137630.078024-0.007716...0.1009170.1009174.360903e-044.360903e-040.1189260.1189260.1913460.191346-0.0002630.000042
2018-01-03-0.102309-0.089450-0.045477-0.1172840.019267-0.0418740.019267-0.0179750.0394940.007503...-0.002302-0.0023024.107921e-054.107921e-050.0752240.0752240.2883050.288305-0.000745-0.000040
2018-01-04-0.042451-0.042332-0.051209-0.0499650.001321-0.0041190.001321-0.0017750.003437-0.117284...0.0096250.0096253.459096e-053.459096e-050.0540750.0540750.3049280.304928-0.001352-0.000165
2018-01-05-0.0054890.0129010.011040-0.005414-0.0595470.103531-0.0595470.036623-0.096222-0.049965...-0.015282-0.015282-4.866762e-05-4.866762e-050.0163150.0163150.2230840.223084-0.002083-0.000345
+

5 rows × 63 columns

+
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_36_2.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_36_3.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_36_4.png)


#### As we can see, the datapoints now express their movements in a much more intuitive manner, on a shared scale of change

###### Normalized change rate: (((x_n - x_(n-1)) / x_(n-1)) - μ) / σ, with μ = Σx_i / n and σ = √(Σ(x_i - μ)² / n) — that is, (change - mean) / standard deviation


```python
#How abnormal was this change percentage ratio compared to the others
def normalized_change(dataframe, wf):
    swf = str(wf)
    new = pd.DataFrame()
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        elif 'rchange_' in str(dataframe[feature].name):
            pass
        elif 'nchange_' in str(dataframe[feature].name):
            pass
        else:
            try:
                std = dataframe['change_' + str(dataframe[feature].name)].std()
                mean = dataframe['change_' + str(dataframe[feature].name)].mean()
                dataframe['nchange_' + str(dataframe[feature].name)] = \
                    (dataframe['change_' + str(dataframe[feature].name)] - mean)/std
                new['nchange_' + str(dataframe[feature].name)] = \
                    (dataframe['change_' + str(dataframe[feature].name)] - mean)/std
            except:
                # columns without a 'change_' counterpart are skipped
                pass

    return dataframe, new

rchange_data = data.copy()
data, normalized_change_data = normalized_change(data, 1)
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
normalized_change_data = normalized_change_data.replace([np.inf, -np.inf], np.nan)
normalized_change_data = normalized_change_data.fillna(method='bfill')
data.plot(x=data.index, y=['nchange_close_20_sma','nchange_adx', 'nchange_close'], figsize=(15, 10))

display(data.tail())
display(normalized_change_data.tail())

plot_histogram_dv(normalized_change_data['nchange_rsi_6'], y1)
plot_histogram_dv(normalized_change_data['nchange_rsi_6'], c15)
```


<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseclose_20_smaclose_20_mstdbollboll_ubboll_lbclose_-1_s...nchange_mdi_14nchange_mdinchange_dx_14nchange_dxnchange_dx_6_emanchange_adxnchange_adx_6_emanchange_adxrnchange_trixnchange_trix_9_sma
date
2018-01-013.30753.31173.30753.30763.2968550.0294853.2968553.3558253.2378853.3111...-0.277020-0.277020-0.045644-0.0456440.1346910.134691-0.148862-0.1488620.0077740.020925
2018-01-023.31083.31273.25853.31103.3002750.0266963.3002753.3536663.2468843.3076...1.6123891.612389-0.026412-0.0264122.0339342.0339341.2497291.2497290.0047290.018454
2018-01-033.25743.26383.24103.25783.3011500.0248493.3011503.3508493.2514513.3110...-0.136300-0.136300-0.043832-0.0438321.2540661.2540661.9048401.904840-0.0202090.014271
2018-01-043.23563.24103.22143.23553.3012100.0246803.3012103.3505713.2518493.2578...0.0657630.065763-0.044118-0.0441180.8766650.8766652.0171572.017157-0.0515890.007907
2018-01-053.23283.24793.22563.23313.2985050.0289013.2985053.3563063.2407043.2355...-0.356199-0.356199-0.047790-0.0477900.2028390.2028391.4641681.464168-0.089369-0.001231
+

5 rows × 255 columns

+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nchange_opennchange_highnchange_lownchange_closenchange_close_20_smanchange_close_20_mstdnchange_bollnchange_boll_ubnchange_boll_lbnchange_close_-1_s...nchange_mdi_14nchange_mdinchange_dx_14nchange_dxnchange_dx_6_emanchange_adxnchange_adx_6_emanchange_adxrnchange_trixnchange_trix_9_sma
date
2018-01-01-0.128347-0.0500570.821339-0.1272590.229905-0.4418050.229905-0.0718890.362274-0.089153...-0.277020-0.277020-0.045644-0.0456440.1346910.134691-0.148862-0.1488620.0077740.020925
2018-01-020.061175-0.003072-1.6767150.0597010.368936-1.0414220.368936-0.2799760.760588-0.124398...1.6123891.612389-0.026412-0.0264122.0339342.0339341.2497291.2497290.0047290.018454
2018-01-03-1.518482-1.565728-0.632093-1.473256-0.020851-0.773884-0.020851-0.3380860.3370150.062559...-0.136300-0.136300-0.043832-0.0438321.2540661.2540661.9048401.904840-0.0202090.014271
2018-01-04-0.648116-0.759089-0.706970-0.646273-0.145504-0.118126-0.145504-0.114609-0.059371-1.470369...0.0657630.065763-0.044118-0.0441180.8766650.8766652.0171572.017157-0.0515890.007907
2018-01-05-0.1106650.1864610.106153-0.098988-0.5682771.751577-0.5682770.415113-1.154962-0.643402...-0.356199-0.356199-0.047790-0.0477900.2028390.2028391.4641681.464168-0.089369-0.001231
+

5 rows × 63 columns

+
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_39_2.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_39_3.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_39_4.png)


###### And now, we can evaluate how anomalous a given datapoint is without losing the information carried by the feature itself

###### Normalizing the raw features instead of the change rate


```python
#How abnormal is the position a datapoint sits at within its feature's distribution
#The z-score of the raw feature value is added alongside the original column
def distance(dataframe):
    new = pd.DataFrame()
    for feature in dataframe:
        if 'label' in str(dataframe[feature].name):
            pass
        elif 'change_' in str(dataframe[feature].name):
            pass
        elif 'nchange_' in str(dataframe[feature].name):
            pass
        elif 'rchange_' in str(dataframe[feature].name):
            pass
        elif 'distance_' in str(dataframe[feature].name):
            pass
        else:
            std = dataframe[feature].std()
            mean = dataframe[feature].mean()
            dataframe['distance_' + str(dataframe[feature].name)] = (dataframe[feature] - mean)/std
            new['distance_' + str(dataframe[feature].name)] = (dataframe[feature] - mean)/std
    return dataframe, new

nchange = data.copy()
data, distance_data = distance(data)
data = data.replace([np.inf, -np.inf], np.nan)
data = data.fillna(method='bfill')
distance_data = distance_data.replace([np.inf, -np.inf], np.nan)
distance_data = distance_data.fillna(method='bfill')
data.plot(x=data.index, y=['distance_close_20_sma','distance_adx', 'close_20_sma'], figsize=(15,10))


display(data.tail())
display(distance_data.tail())

plot_histogram_dv(distance_data['distance_macdh'], y1)
plot_histogram_dv(data['macdh'], y1)
plot_histogram_dv(distance_data['distance_macdh'], c15)
plot_histogram_dv(data['macdh'], c15)
```


<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseclose_20_smaclose_20_mstdbollboll_ubboll_lbclose_-1_s...distance_mdi_14distance_mdidistance_dx_14distance_dxdistance_dx_6_emadistance_adxdistance_adx_6_emadistance_adxrdistance_trixdistance_trix_9_sma
date
2018-01-013.30753.31173.30753.30763.2968550.0294853.2968553.3558253.2378853.3111...0.4889660.488966-0.348806-0.348806-0.616822-0.616822-0.665492-0.6654920.3381690.335080
2018-01-023.31083.31273.25853.31103.3002750.0266963.3002753.3536663.2468843.3076...2.3029472.3029471.0643751.064375-0.031028-0.031028-0.485673-0.4856730.3144580.339260
2018-01-033.25743.26383.24103.25783.3011500.0248493.3011503.3508493.2514513.3110...2.2327782.2327781.3719211.3719210.5057430.505743-0.182739-0.1827390.2502500.335284
2018-01-043.23563.24103.22143.23553.3012100.0246803.3012103.3505713.2518493.2578...2.5214542.5214541.6628561.6628561.0011061.0011060.1946730.1946730.1484510.318967
2018-01-053.23283.24793.22563.23313.2985050.0289013.2985053.3563063.2407043.2355...2.0326852.0326851.2109831.2109831.1810511.1810510.5227490.5227490.0275330.285929
+

5 rows × 321 columns

+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
distance_opendistance_highdistance_lowdistance_closedistance_close_20_smadistance_close_20_mstddistance_bolldistance_boll_ubdistance_boll_lbdistance_close_-1_s...distance_mdi_14distance_mdidistance_dx_14distance_dxdistance_dx_6_emadistance_adxdistance_adx_6_emadistance_adxrdistance_trixdistance_trix_9_sma
date
2018-01-011.1293461.1051201.1619481.1309771.128932-0.3264461.1289321.0569271.2036221.136831...0.4889660.488966-0.348806-0.348806-0.616822-0.616822-0.665492-0.6654920.3381690.335080
2018-01-021.1338881.1064951.0938901.1356961.133691-0.4333231.1336911.0540541.2167001.131972...2.3029472.3029471.0643751.064375-0.031028-0.031028-0.485673-0.4856730.3144580.339260
2018-01-031.0603851.0392241.0695841.0618541.134909-0.5040661.1349091.0503041.2233381.136692...2.2327782.2327781.3719211.3719210.5057430.505743-0.182739-0.1827390.2502500.335284
2018-01-041.0303771.0078581.0423611.0309021.134993-0.5105431.1349931.0499341.2239161.062847...2.5214542.5214541.6628561.6628561.0011061.0011060.1946730.1946730.1484510.318967
2018-01-051.0265231.0173501.0481941.0275701.131228-0.3488401.1312281.0575671.2077191.031893...2.0326852.0326851.2109831.2109831.1810511.1810510.5227490.5227490.0275330.285929
+

5 rows × 66 columns

+
![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_2.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_3.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_4.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_5.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_42_6.png)


#### And evaluate how far each datapoint sits from the others of its kind

###### Creating new features via interactions between them


```python
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    # Get feature names: the originals plus every pairwise combination
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]

    # Compute the pairwise interaction terms
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames

    # Remove interaction terms that are all zeros
    noint_indices = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indices], axis=1)

    return df
```


```python
teste = add_interactions(data.copy())
print(teste.head(5))
```

         open    high     low   close  close_20_sma  close_20_mstd      boll  \
    0  1.8189  1.8213  1.7655  1.7835      1.758200       0.028923  1.758200   
    1  1.8136  1.8460  1.8129  1.8336      1.762635       0.033447  1.762635   
    2  1.7860  1.8663  1.7860  1.8496      1.767467       0.038380  1.767467   
    3  1.8064  1.8712  1.8064  1.8680      1.772758       0.043854  1.772758   
    4  1.8200  1.8729  1.8200  1.8729      1.777765       0.048201  1.777765   
    
        boll_ub   boll_lb  close_-1_s ...
    0  1.816046  1.700354      1.8188 ...
    1  1.829528  1.695742      1.7835 ...
    2  1.844227  1.690707      1.8336 ...
    3  1.860465  1.685051      1.8496 ...
    4  1.874167  1.681363      1.8680 ...
    
       distance_adx_distance_adx_6_ema  distance_adx_distance_adxr  \
    0                         2.155962                     2.155962   
    1                         3.204561                     3.204561   
    2                         2.238586                     2.238586   
    3                         1.551822                     1.551822   
    4                         1.090493                     1.090493   
    
       distance_adx_distance_trix  distance_adx_distance_trix_9_sma  \
    0                    1.237139                          0.663483   
    1                    1.774381                          1.013457   
    2                    1.445436                          0.852129   
    3                    1.260181                          0.754362   
    4                    1.133128                          0.694455   
    
       distance_adx_6_ema_distance_adxr  distance_adx_6_ema_distance_trix  \
    0                          1.881588                          1.079697   
    1                          2.633112                          1.457967   
    2                          2.594067                          1.674967   
    3                          2.216411                          1.799871   
    4                          1.767651                          1.836761   
    
       distance_adx_6_ema_distance_trix_9_sma  distance_adxr_distance_trix  \
    0                                0.579046                     1.079697   
    1                                0.832734                     1.457967   
    2                                0.987445                     1.674967   
    3                                1.077429                     1.799871   
    4                                1.125687                     1.836761   
    
       distance_adxr_distance_trix_9_sma  distance_trix_distance_trix_9_sma
    0                           0.579046                           0.332270
    1                           0.832734                           0.461089
    2                           0.987445                           0.637585
    3                           1.077429                           0.874942
    4                           1.125687                           1.169698
    
    [5 rows x 51681 columns]


## Feature Selection

###### The methods based on the F-test estimate the degree of linear dependency between two random variables. Mutual information methods, on the other hand, can capture any kind of statistical dependency, but, being nonparametric, they require more samples for an accurate estimation.
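To make that distinction concrete before applying it to our features, here is a small synthetic sketch (not part of the original analysis): a purely quadratic dependency is nearly invisible to the F-test but stands out under mutual information.

```python
import numpy as np
from sklearn.feature_selection import f_regression, mutual_info_regression

rng = np.random.RandomState(0)
X = rng.uniform(-1, 1, size=(1000, 2))
# y depends linearly on the first column and quadratically on the second
y = X[:, 0] + X[:, 1] ** 2 + 0.1 * rng.randn(1000)

f_scores, _ = f_regression(X, y)
mi_scores = mutual_info_regression(X, y)
print("F-test scores: " + str(f_scores / f_scores.max()))   # quadratic column scores near zero
print("MI scores:     " + str(mi_scores / mi_scores.max())) # both dependencies detected
```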
```python
import numpy as np
# the targets are continuous scores here, so the regression variants are needed
# (the original cell imported f_classif / mutual_info_classif but never used them)
from sklearn.feature_selection import f_regression, mutual_info_regression

y_15 = c15[15:-15]
y_1 = c1[15:-15]
y_5 = y_5[15:-15]
y_30 = y_30[15:-15]
mi = mutual_info_regression(distance_data, y_15, discrete_features='auto')
mi /= np.max(mi)
result = distance_data.columns[mi > 0.1]
miresult = result
mi = mi[mi > 0.1]
print(len(result))
display(result)
mi_df = pd.DataFrame(index=result, columns=['value'])
mi_df['value'] = mi
mi_df.plot(figsize=(15,10))
display(mi_df.head())
print(mi_df)

print("\n")

# f_regression returns (F-scores, p-values); the filter below is on the p-values
ftest, pvalues = f_regression(distance_data, y_15)
ftest /= np.max(ftest)
pvalues[np.isnan(pvalues)] = 0.0
f = pvalues[~np.isnan(pvalues)]
result = distance_data.columns[f > 0.1]
f = f[f > 0.1]
print(len(result))
print(result)

f_df = pd.DataFrame(index=result, columns=['value'])
f_df['value'] = f
f_df.plot(figsize=(15,10))
display(f_df.head())
print(f_df)

# features selected by both criteria
equal = []

for i in miresult.values:
    if i in result.values:
        equal.append(i)

print("\n")
display(equal)
```

    29



    Index([u'distance_open', u'distance_high', u'distance_low', u'distance_close',
           u'distance_close_20_sma', u'distance_close_20_mstd', u'distance_boll',
           u'distance_boll_ub', u'distance_boll_lb', u'distance_close_-1_s',
           u'distance_close_26_ema', u'distance_macd', u'distance_middle',
           u'distance_cr-ma1', u'distance_cr-ma3', u'distance_open_2_sma',
           u'distance_middle_14_sma', u'distance_middle_20_sma', u'distance_atr',
           u'distance_close_10_sma', u'distance_close_50_sma', u'distance_dma',
           u'distance_atr_14', u'distance_dx_14', u'distance_dx',
           u'distance_dx_6_ema', u'distance_adx', u'distance_trix',
           u'distance_trix_9_sma'],
          dtype='object')



<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
value
distance_open0.440642
distance_high0.443556
distance_low0.505598
distance_close0.468534
distance_close_20_sma0.491667
+
+ + + value + distance_open 0.440642 + distance_high 0.443556 + distance_low 0.505598 + distance_close 0.468534 + distance_close_20_sma 0.491667 + distance_close_20_mstd 0.217032 + distance_boll 0.494343 + distance_boll_ub 0.829823 + distance_boll_lb 0.555011 + distance_close_-1_s 0.442161 + distance_close_26_ema 0.729244 + distance_macd 0.168234 + distance_middle 0.637619 + distance_cr-ma1 0.207764 + distance_cr-ma3 0.198476 + distance_open_2_sma 0.450697 + distance_middle_14_sma 0.642620 + distance_middle_20_sma 0.506292 + distance_atr 0.241409 + distance_close_10_sma 0.624836 + distance_close_50_sma 1.000000 + distance_dma 0.172680 + distance_atr_14 0.246042 + distance_dx_14 0.185833 + distance_dx 0.173521 + distance_dx_6_ema 0.113376 + distance_adx 0.113376 + distance_trix 0.319277 + distance_trix_9_sma 0.260197 + + + 24 + Index([u'distance_open', u'distance_high', u'distance_low', u'distance_close', + u'distance_boll_lb', u'distance_close_-1_s', u'distance_close_-1_d', + u'distance_close_-1_r', u'distance_middle', u'distance_cr-ma3', + u'distance_rsv_9', u'distance_kdjk_9', u'distance_kdjk', + u'distance_kdjj_9', u'distance_kdjj', u'distance_open_2_sma', + u'distance_wr_10', u'distance_middle_14_sma', u'distance_close_10_sma', + u'distance_pdm_14_ema', u'distance_pdm_14', u'distance_adx_6_ema', + u'distance_adxr', u'distance_trix_9_sma'], + dtype='object') + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
value
distance_open0.191533
distance_high0.181462
distance_low0.210108
distance_close0.138125
distance_boll_lb0.141074
+
                              value
    distance_open          0.191533
    distance_high          0.181462
    distance_low           0.210108
    distance_close         0.138125
    distance_boll_lb       0.141074
    distance_close_-1_s    0.141206
    distance_close_-1_d    0.740016
    distance_close_-1_r    0.530851
    distance_middle        0.174595
    distance_cr-ma3        0.211435
    distance_rsv_9         0.249812
    distance_kdjk_9        0.276445
    distance_kdjk          0.276445
    distance_kdjj_9        0.714550
    distance_kdjj          0.714550
    distance_open_2_sma    0.184072
    distance_wr_10         0.488122
    distance_middle_14_sma 0.110842
    distance_close_10_sma  0.116276
    distance_pdm_14_ema    0.299721
    distance_pdm_14        0.299721
    distance_adx_6_ema     0.506360
    distance_adxr          0.506360
    distance_trix_9_sma    0.250674




    ['distance_open',
     'distance_high',
     'distance_low',
     'distance_close',
     'distance_boll_lb',
     'distance_close_-1_s',
     'distance_middle',
     'distance_cr-ma3',
     'distance_open_2_sma',
     'distance_middle_14_sma',
     'distance_close_10_sma',
     'distance_trix_9_sma']



![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_49_7.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_49_8.png)


```python
from sklearn.decomposition import PCA

# Project the standardized features onto their two main principal components
pca = PCA(n_components=2)
data_pca = pd.DataFrame(pca.fit_transform(distance_data))
#display(data_pca.head())
data_pca.plot(figsize=(15,10))

datatest = pca.fit_transform(distance_data)
plt.figure(num=None, figsize=(18, 11), dpi=80, facecolor='w', edgecolor='k')
plt.scatter(datatest[:, 0], datatest[:, 1])
plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_50_0.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_50_1.png)


###### t-Distributed Stochastic Neighbor Embedding (t-SNE)
Transforming the data into a similarity matrix, so the similarity of each datapoint can be compared with the rest


```python
# t-distributed Stochastic Neighbor Embedding (t-SNE) visualization
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
x_test_2d = tsne.fit_transform(distance_data)

# Combine the 15-day and 1-day targets into four classes for coloring
y_tsne = []
for key, i in np.ndenumerate(y_15):
    if i == 0:
        if y_1[key[0]] == 0:
            y_tsne.append(0)
        elif y_1[key[0]] == 1:
            y_tsne.append(1)
    if i == 1:
        if y_1[key[0]] == 0:
            y_tsne.append(2)
        elif y_1[key[0]] == 1:
            y_tsne.append(3)

y_test = np.array(y_tsne)


markers = ('s', 'd', 'o', '^', 'v')
color_map = {0: 'red', 1: 'blue', 2: 'lightgreen', 3: 'purple'}
plt.figure(figsize=(15,10))
for idx, cl in enumerate(np.unique(y_test)):
    plt.scatter(x=x_test_2d[y_test==cl, 0], y=x_test_2d[y_test==cl, 1], c=color_map[idx], marker=markers[idx], label=cl, alpha=0.5)
plt.xlabel('X in t-SNE')
plt.ylabel('Y in t-SNE')
plt.legend(loc='upper left')
plt.title('t-SNE visualization of test data')
plt.show()
```


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_52_0.png)
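t-SNE is stochastic and scales poorly with the number of input dimensions, so a common refinement (not applied in the run above) is to pre-reduce the features with PCA and fix the perplexity explicitly; a minimal sketch, where the component count and perplexity are untuned guesses:

```python
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Pre-reducing to ~30 principal components before t-SNE usually speeds up
# the embedding and suppresses noise in the pairwise similarities
reduced = PCA(n_components=30).fit_transform(distance_data)
embedded = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(reduced)
```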
# Facebook Time Series Forecasting

###### Prophet library


```python
from fbprophet import Prophet
import numpy as np

# Prophet expects a 'ds' (datestamp) column and a 'y' (value) column;
# the close price is log-transformed to stabilize the variance
test = data.copy()
test['ds'] = data.index
test['y'] = np.log(data['close'])
display(test.tail())
m = Prophet()
m.fit(test)
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]  # wrap in display() to inspect the forecast frame
m.plot(forecast)
m.plot_components(forecast)
```


<div>
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
openhighlowcloseadj closevolumedsy
Date
2018-02-053.21693.24553.21383.21543.21540.02018-02-051.167952
2018-02-063.26113.27593.21753.26113.26110.02018-02-061.182065
2018-02-073.23333.26303.23143.23343.23340.02018-02-071.173534
2018-02-083.26963.29263.25623.26993.26990.02018-02-081.184759
2018-02-093.28443.30753.27083.28463.28460.02018-02-091.189245
+
    INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
    /Library/Python/2.7/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
      elif np.issubdtype(np.asarray(v).dtype, float):


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_55_2.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_55_3.png)


![png](Dataset%20modeling%20for%20financial%20time%20series%20data_files/Dataset%20modeling%20for%20financial%20time%20series%20data_55_4.png)
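A natural next step, left here only as a sketch, is to score the forecast with Prophet's built-in diagnostics; the horizon below is an illustrative assumption, and since y is log(close) the resulting error is in log units:

```python
from fbprophet.diagnostics import cross_validation
import numpy as np

# Simulated historical forecasts: refit on expanding windows and
# predict 180 days ahead from each cutoff
df_cv = cross_validation(m, horizon='180 days')
mae = np.mean(np.abs(df_cv['yhat'] - df_cv['y']))
print(" [+] Mean absolute error over the horizon (log units): " + str(mae))
```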