From 04a1ad33474e2be6648c7a37ac5c3d9e5fe82745 Mon Sep 17 00:00:00 2001 From: MasterDoo Date: Sat, 1 Sep 2018 15:27:03 +0800 Subject: [PATCH] complete UnivariateContainer class and change input feed from ndarray to series --- .../__pycache__/containers.cpython-36.pyc | Bin 4626 -> 6788 bytes .../__pycache__/methods.cpython-36.pyc | Bin 6203 -> 6203 bytes k models/exchange/containers.py | 123 +++++++++++++----- k models/exchange/ex_model.py | 14 +- k models/pandas/_libs/parsers.pyx | 0 space.code-workspace | 11 ++ 6 files changed, 111 insertions(+), 37 deletions(-) create mode 100644 k models/pandas/_libs/parsers.pyx create mode 100644 space.code-workspace diff --git a/k models/exchange/__pycache__/containers.cpython-36.pyc b/k models/exchange/__pycache__/containers.cpython-36.pyc index 6f96530323d87dc64432862d681edb0ee1988b0e..6902881624b8332fff0f3440a7d8068ff48d2a08 100644 GIT binary patch literal 6788 zcma)BNpl;=6`manL2zG1Nu$NGpeUeZJBh=RZCR3~*d@~`OD;P|oXH5?AO{-Ez&#C# zM1hx3uE?oUs(g$sU2^j+x8#(2{=yt`$swh`piA<-9xMb!CjqPX*Kd0H-qw75afq3t2I$8>BP$lCtKxOS=H^Z*qW+Msk#$Rw`OWHt=ZbFuHDjDj^!U|EWdBm=Flpz zB3ea0w{OoexEAgPO>ErU36x4GcO=UBhCSg;s#nxF9NaU}g9Qf!TSBXtCq9TC+H%%>lfl&&Ks zC1esuw6EhpXp81%_ZT=df~}75)yC@Eynf$p`$@uC^)A+Kw?z<1EZRzV*bsTJ_)ZhV z>BZ01KS{Vq)@9&FyB*e9zY*6vEgs2a{Z=5iJNMS#jiUGJakPHlZN-d-$vWSuZ~M^} zUq^4Y?xfn@O-t&t9y+mFP#)`(v)Lzlp z^iz|~uvyHUVy`Ni5})QX%~@V*7;OHj#TM8iM(5c1{%DCxwuF&owt|s)R_TvW|1t1e zW5?0Ipn4aZOU-4ff!hhJ@+5Pyt`)`O6xMl~ok4qb*gnf%LHih61!gX!bHMEbUj)7! z!LNJ0=;AjfFLB-VBj#o|?$$$q%B3r*D$*Oi^xYv0R9A|dYBZkfIN+^s{z$*Szw?|# zLQofO+i7vR9kXtp=>=)h()8EEpGQQSwB33drdM2E9fsyrk2v6{Rn>xbyz0(J-un|Ls8u~k~byP)}ZHG zjhWBPeNak>Cs8rnBYGWGL;nN5;MJ-;G;4pOeCBhTY&1*$cHm=*fA!HU4a#1j*v zMYJc7s5Dp!{gY1&#i4}JNqn)4k;!khXIL-zYt2#iZ-?gKyUa5qR?zSHJ)$(bu-ciVr!q0n1zy>?L*-C0-gIu9OT zIEWI-eOB#O|2N-cIBmmKAnne%@jYn82TGkL?n5X-S1i&pi*AGaP|iH59t~i^CSaS| z3eEy{Q-cC7$XNQJM__;RPNxml0d%u_oBJXPqAmC5BEA>$mV1lC3P2=T_RS3P*bL)4 zu2{)<<3t3U(DFy5YF=8Z;NjnTAwHoM5wmI8Sl1Z{nKSk_*<|f;In{T+? z;}?$@-W?9F9U7)@jwT;9I-FZMrF3;#eix=lkZpR8tf?Twh-2bWYO6(vB6Z}VPRql= zXBC{YI7ZzQ9;pY|>Zs0Z6QepyXS^*AL)~e6VgDd&IcgVzQMds^>IK7eBJo?W z+FlZLd1}HY;$}$SsCH2uOje)p)FFCwo@p){_-s(wO&vAsvwn%TmylnO7F2=jiE8Ek zq-=zx#r{y-k%O__!B_#bQ36fZs!e8``y>4(2>p7qWh6lrT1%vsAxfRjqk_9`xO{k>Cu4u{vO1 z(FG~mI;nK}kqA#ADb2fy3e%+~p{4fWr$DP1lCRuF;9xnGqnC4cAkPO@e*Sw% zfnSpK1C-@)%1R+M6Dd+l?zVyYZN8Hl0TZv_n#4KEen453vL7O=7z$)*Uhp9E#=>7-;2?Vuwud$!R0GekinH41p1M>v60j={)S2qJ~}PsOE_@Kgd?V*x}oiv zdsfeE!d=KrI-G`^67C}P0ipzD1BMX}1>6Qp+5=hw{+`{ln+`c3Fz6;zJbQ50R<=&l;o458DGeThSGIPA}KX_X-V)Bb?35I?ddQWnaAyl>7X}kX}{ANDsp|4mk0#l zQkhHIzHtKzYtMSDZRtI0N!znPZ{j`>COk2|(jG&J;T91vJTVZ8<+gN|XT~G#FbB*8 z4jyZs%?f+YW3A^jC~PZs|1qY)2BzPIt>Oa8in#-;=Ju7#xE6wr$ZUoyx4A2W7KaWH z0Z#_x@EhPI`5D8^;TUin`?7Lx;F@G!R^s|e#&Mv7f~eie+>ClhzzoZbaEu4dQU3X8 zjYFX)<3M+yA~W)$wjgVtm-lp>Af#^Su6ygPII z(t3Z->)G~R+SDCd!of{Jkw7s#GqO#An|?dnnIUe2R(wd=Au;#}eP0nHIx1%vDH&~+ z7RY|x6o|OQ4GgN7r|!$tA$&*hP3m#jX+=pTcR(mzP>K&I`;_JsK%*k;B-4fTb*Ndg z(?Z3~=TYk-Wn!2w8f)Yk;E((S@))|W{9mU#kiiqgnOu;U1#qZ#1%@F$`h8PG5;kDZ z+Ia&W13U$S1PR~gry)3c4>1_!J!4s0(UwVdS`b5rlC%EAgQD(9oIx#h{C1m1EH$EbYWqU?yW%EMD69J_YGb!HI<*BN zBA}!uJlMl%p^1M$N@kI1RvtG5pDpv|pgg}vWsV%@O}LSyD&YV_ODjK)R3iOIOsP)$ zNC-e=6v^~41BP!A(Xxtde8(=el2y zo&3#MKtmx=A*V>HPz&!7au?AU%ndrTi@5B@QMfxgbn@8Fj2`s1gDt${WwhkJ4Y#R$ zqE5RV3t4qPNvMhFup<&gA;WbXe0s=C=r7*?Z%Z2EuJoo_q}yMG#JiY#*~Pn|8^IFq zlG~Xr0S|Gbcv0boZh_(;WpI^859v{!k1}JQ(~v^MxUNWN;o(0(Fm)g*dP=uNWSTt< zFckHYzG!sM9VxOeO@_OtKB#a&`iFjm1F9<=oJNIAWz*Ov14xby?it39Y6=0$puvN) z5oz65HYS~HlNT1(c|u2@mhVVBBl&`5@29ER2&DK47$4kp>SQqzqCCi1KLk3A{x>k` zJEVk6xn@pdyWQ0zx2l*y&(ZeA-cpAM{}>|@2yJ2~>OE*<*aF_g1sYN4r?;XFW!TY^ zz0u%3NpI4NA-Qez>O*=6e1LR2bH~JSu&0NO;iCm@3+R9zKxDz$Ma0-~CuHtDylq5T z!0n!oE+QxpR9>!B@ggAIOuINrr--v)nqfw%k{_d)&Zt{WxHpU=`%ZjzwTBQiH9(L$ zAaD22V3CkBu9+91MM`@2%#r8WhfaUi*oo?xtgu4BQSUnX=D_NWFnwqp6OhLk?=+%Q zmw&}PW&PmQAL;TPpv9Ha_pn8>yW;%=b?Tr{VogvqDyAu$L6#zBVR0QX1O+X4U;q?J z8@0`{FKTWIP4wWbtdtJj#AUnNJzP zRdC80l#Px+)s*!m-7%Q6rR~#uh;AUza(}-yKNKbwoOV~6%d5BX{%Nch@ce+r9sQqvrg?gYio9s z3M(jwz5p}&boN(!gjZ0`O)>Q(hz%lYzzT9m>)yvt=x%*g>UX2k9Ww&a&K&!SbxIl~~L% zXLrZOl@(#9#LBRfS8|k9z-pC^fSrPp72BnD8L6fQdnYwE%En-`q72mF&^j9jzPiUx zuxEfDVFw^2hr};>x9WiVZbF(XL42#rq~7tDWkPg}1JAjd)VY60nvHJQY%Mo*(8Pf& zUi=cw=x-wK8@F}Rw#2DKZ6mc#+BQsFHI}InmiQF*E&(?}V*2pSOGwnP1ISP-HvJ zz>imX5Y6yW5b!YoSKFs~9R*aT;%@ABJh(Qmp`^ky;*vErVgVzQ;p&Rly%`M%uqZaI z>8#YF)t)rh{LX3+iJz@w8>SaE{7xWEweZC>Yc-n#8FO`>0$Y3>VFF>l)Ue8=<$0~J z6?@(R;35Zr*g0xy1zMpRb!d(nw5}V#*;)p$r1vKtUpNjfb;DTP%UBx&8!qKZFwmfA z1_s6lPf^J_4FrS0x5?LpK`7f4R!oNms{yMHYXVjSR`Z+mM|7P+n3D_rY1jX z{n&TVhZEdyg#mZ&wBlP%zvMvF0>0+Qt!@}Oe#o5u`+F%lAyVnn+Kesc(oS*;Re|mR zJ-JJNnLc?f!4HGG{0PEPgyRUu03eMRQEBm@$Gs@#Pe(NAq15xvgI0u31xe1({u9-s_|Ye<8^6*$ITnxWrpbvlCxs{er$OC2YceH4pdbCLNm7GLUzrF@Z6TZDy) z5tUMm^FQ`%rxnGG?n>|6b#b?Fg;vCa!UtDT0sKA-3x5IO$s*t{f$Tk)>Zvh_Yf7!h zWD3K3h5IXkOmIJ34#ak`K4%UJ3bU#F=4X(a1CXgkx8sGNAE^VN8eYUb^o&mdyaqEW ziNA_ftv?0vsEUK7azRk^(67fah43mj{RV>dl~1j9{91Fy!qz;YrbT}qSmXIabjYT3lQ3WR$Vtcmb9@s10IBpK>S$^7vKqi*hbAR zUvZ5gU#H=3(8-X2-A;bm$rGT^h8bab5a?H%_Ur1D`y-Zt4xyK?{Fj%YBeT%{YXOf3 zr}RMd*yTgJE@#lr^#9pO^8j9INYH8{jzoLx_{IW64D-iN0X(Kr;Er;LFX1@^__o4J z=XtE#@H~D+aUoObm7!TX25dM#($v{0(2EAS4EB?>x6UAM%qiEnGC z%gSf72d|7D4Lq6`f7H&;s}HtP9Xd*#jOR&fcBRW!JHa{C+W0*%s6{Oe=H4$u;iu=! Fe*s(;#!mnM diff --git a/k models/exchange/__pycache__/methods.cpython-36.pyc b/k models/exchange/__pycache__/methods.cpython-36.pyc index 476709b91e8bdc5756cb5a9e54827dd75616a5fc..d179a57a47ccc49e66d379a6c79e890e70e0451d 100644 GIT binary patch delta 290 zcmdmOu-ky$n3tE!Q@Cp*`!p`DTkIu8iJ5uv5w|8U=Q88G#a5D9Tms@f<7#B|+nm8& z&&U|Q`2~+PBU>Z`14B{FWKF&;Vo5;eE!Nb6{N#)%_Po^6qWrw#qTtD&`C^0$fKr+) zMfo7bAZ0+|ME+nlFf(QHE`B9$WU1%;W{ky~$a* zlumXP(_w6wTqKsj*fIICm=$+DNVpk9Bu$nO_fo6|GH>x0XXm6Q7Uk*Xrj`_CCKnem ogA}lUh`Pzm;ub1iAZ9R#NCpugXBOpwhn-xrSo0BI{wXaE2J diff --git a/k models/exchange/containers.py b/k models/exchange/containers.py index 61b683b..0619347 100644 --- a/k models/exchange/containers.py +++ b/k models/exchange/containers.py @@ -13,10 +13,12 @@ from matplotlib import pyplot as plt from sklearn.metrics import mean_squared_error from sklearn.preprocessing import MinMaxScaler +from typing import Tuple, Union class BaseContainer(): def check_config(self, cf: dict) -> bool: + # TODO: Add config check print("Configuration check passed.") return True @@ -29,69 +31,100 @@ class UnivariateContainer(BaseContainer): def __init__( self, - raw: np.ndarray, + series: pd.Series, config: dict={ "method": "diff", - "diff_lag": 1, - "diff_order": 1, + "diff.lag": 1, + "diff.order": 1, "test_ratio": 0.2, - "lag_for_sup": 32 + "lag_for_sup": 3, + "target_idx": 0 }): assert self.check_config(config) self.config = config # Input format: num_obs * num_fea - assert len(raw.shape) == 2, \ - f"UnivariateDataContainer: Raw value is expected to have dim=2, dim={len(raw.shape)} received instead." + assert len(series.shape) == 1, \ + f"UnivariateDataContainer: Series feed is expected to have shape=(n,) as univariate series, \ + but shape={len(series.shape)} received instead." + self.series = series + self.raw = self.series.values.reshape(-1, 1) print( - f"UnivariateDataContainer: Raw values received: {raw.shape[0]} observations with {raw.shape[1]} features.") - self.num_obs, self.num_fea = raw.shape + f"UnivariateDataContainer: Univariate series with {len(self.series)} obs received.") + self.num_obs, self.num_fea = self.raw.shape self.differenced = self._difference( - raw, lag=self.config["diff_lag"], order=self.config["diff_order"] + self.raw, lag=self.config["diff.lag"], order=self.config["diff.order"] ) - self.sup_set = self._gen_sup_learning( + self.sup_set, self.tar_idx = self._gen_sup_learning( self.differenced, total_lag=self.config["lag_for_sup"]) + print( + f"Supervised Learning Problem Generated with target index {self.tar_idx}") self.sup_set = self.sup_set.values + self.sup_fea = self.sup_set.shape[1] - self.sample_size = len(self.sup) + self.sample_size = len(self.sup_set) self.test_size = int(self.sample_size * config["test_ratio"]) self.train_size = int(self.sample_size - self.test_size) # Split data # Note: idx0 = obs idx, idx1 = feature idx # Note: feature idx = 0 --> target - self.train_X, self.train_y, self.test_X, self.test_y = self._split_data(self.sup) - - print(f""" - Test and training data spliting finished: + + self.train_X, self.train_y, self.test_X, self.test_y \ + = self._split_data( + self.sup_set, + tar_idx=self.tar_idx + ) + + print(f"""\tTest and training data spliting finished: train X shape: {self.train_X.shape}, train y shape: {self.train_y.shape}, test X shape: {self.test_X.shape}, test y shape: {self.test_y.shape}""") # Scale the training data. - # self.scaler, self.diff_scaled = self._scale(self.differenced) + # Scaler are created based on training data set. NOT the whole dataset. + self.scaler_in, self.train_X_scaled = self._scale(self.train_X) + self.scaler_out, self.train_y_scaled = self._scale(self.train_y) + + self.test_X_scaled = self.scaler_in.transform(self.test_X) + self.test_y_sacled = self.scaler_out.transform(self.test_y) def __str__(self) -> str: - repr_str = f"""Univariate Data Contrainer at {hex(id(self))} - with {self.num_obs} obs and {self.num_fea} features, - Supervised Learning problem generated. - Total sample size: {self.sample_size} obs. - Training set size: {self.train_size} obs. - Testing set size: {self.test_size} obs. + # TODO: Add shapes of train/test Xy sets to report string. + repr_str = f"""\t{str(type(self))} object at {hex(id(self))} + Raw Data: + Dataset size: {self.num_obs} obs. + Number of features: {self.num_fea} features. + Supervised Learning problem generated: + Total sample size: {self.sample_size} obs. + Training set size: {self.train_size} obs. + Testing set size: {self.test_size} obs. """ return repr_str - + def __repr__(self): - self.__str__() + return self.__str__() + + def _split_data(self, data: np.ndarray, tar_idx: Union[int, list]=0) -> Tuple[np.ndarray]: + # Returing order: (train_X, train_y, test_X, test_y), univariate. + train, test = data[:self.train_size], data[self.train_size:] + assert train.shape[1] == self.sup_fea, \ + f"Got train shape: {train.shape}, expected feature: {self.sup_fea}" + fea_idx = list(range(train.shape[1])) + fea_idx.remove(tar_idx) - def _split_data(self): - pass # FIXME: Stopped here Sep. 1 2018 + train_X = train[:, fea_idx] + train_y = train[:, tar_idx].reshape(-1, 1) + test_X = test[:, fea_idx] + test_y = test[:, tar_idx].reshape(-1, 1) + + return (train_X, train_y, test_X, test_y) def _gen_sup_learning(self, data: np.ndarray, total_lag: int=1, nafill: object=0.0) \ - -> pd.DataFrame: + -> (pd.DataFrame, int): """ Generate superized learning problem. Transform the time series problem into a supervised learning @@ -107,11 +140,12 @@ def _gen_sup_learning(self, data: np.ndarray, total_lag: int=1, nafill: object=0 col_names = ["L0/current/target"] + \ [f"L{i}" for i in range(1, total_lag+1)] df.columns = col_names - return df + tar_idx = 0 + return (df, tar_idx) def _difference(self, data: np.ndarray, lag: int=1, order: int=1) -> np.ndarray: """ - Note: set lag=1 & order=0 to use the original data. + Note: set lag=1 & order=0 to use the original data. """ if order != 0: diff = list() @@ -123,17 +157,34 @@ def _difference(self, data: np.ndarray, lag: int=1, order: int=1) -> np.ndarray: return self._difference(diff, lag, order-1) return data - def _invert_difference(self, data: np.ndarray) -> np.ndarray: - pass - - def _scale(self, data: np.ndarray) \ - -> (sklearn.preprocessing.StandardScaler, np.ndarray): + def _invert_difference(self, data: np.ndarray, idx: int) -> np.ndarray: + """ + For initial stationarity removal order=1 only. + #TODO: add higher order of differencing support. Using recursion + """ + assert self.config["diff.order"] == 1, \ + "Initial stationarity removal differencing with order higher than 1 are not yet supported." + + lag = self.config["diff.lag"] + if idx - lag >= 0: + return self.raw[idx - lag + 1] + data + else: + return data + + def _scale(self, data: np.ndarray) -> ( + sklearn.preprocessing.StandardScaler, + np.ndarray): scaler = sklearn.preprocessing.StandardScaler().fit(data) data_scaled = scaler.transform(data) return scaler, data_scaled - def _invert_scale(self, data: np.ndarray): - pass + def _invert_scale_y(self, data: np.ndarray): + # Invert scale the output from model. + # Assert data type to be univariate time series data with shape (n,) or (n, 1) + assert len(data.shape) == 1 or (len(data.shape) == 2 and data.shape[1] == 1), \ + f"Unexpected data array feed, should be in shape (n,) or (n,1). Get {data.shape}" + data = data.reshape(-1, 1) + return self.scaler_out.inverse_transform(data) def reconstruct(self, data: np.ndarray): pass diff --git a/k models/exchange/ex_model.py b/k models/exchange/ex_model.py index bbbc492..630bba3 100644 --- a/k models/exchange/ex_model.py +++ b/k models/exchange/ex_model.py @@ -10,8 +10,12 @@ from matplotlib import pyplot as plt from sklearn.metrics import mean_squared_error from sklearn.preprocessing import MinMaxScaler +from pprint import pprint +import methods from methods import * +import containers +from containers import * config = { "batch_size": 1, @@ -22,10 +26,17 @@ } # Load dataset. -series = load_dataset(dir="../data/DEXCHUS.csv") +series = load_dataset( + dir="/Users/tianyudu/Documents/Github/AnnEcon/k models/exchange/DEXCHUS.csv") # Transform to stationary data to Delta 1 raw_values = series.values + +sample_data = np.array([x ** 2 for x in range(10)]).reshape(-1,1) + +c = UnivariateContainer(sample_data) +# cont = UnivariateContainer(raw_values.reshape(-1,1)) + # diff would have length = len(raw_value) - 1 as it's taking the gaps. diff_values = difference(raw_values, lag=1) @@ -43,6 +54,7 @@ f"Total sample found {total_sample_size}, {test_size} will be used as test set." ) train, test = sup[:-test_size], sup[-test_size:] +train2, test2 = sup[:train_size], sup[train_size:] # Generate scaler and scaling datasets. # scaler on input matrix(X) and output(y) diff --git a/k models/pandas/_libs/parsers.pyx b/k models/pandas/_libs/parsers.pyx new file mode 100644 index 0000000..e69de29 diff --git a/space.code-workspace b/space.code-workspace new file mode 100644 index 0000000..25033e6 --- /dev/null +++ b/space.code-workspace @@ -0,0 +1,11 @@ +{ + "folders": [ + { + "path": "k models" + }, + { + "path": "." + } + ], + "settings": {} +} \ No newline at end of file