diff --git a/k models/exchange/__pycache__/containers.cpython-36.pyc b/k models/exchange/__pycache__/containers.cpython-36.pyc index 6f96530..6902881 100644 Binary files a/k models/exchange/__pycache__/containers.cpython-36.pyc and b/k models/exchange/__pycache__/containers.cpython-36.pyc differ diff --git a/k models/exchange/__pycache__/methods.cpython-36.pyc b/k models/exchange/__pycache__/methods.cpython-36.pyc index 476709b..d179a57 100644 Binary files a/k models/exchange/__pycache__/methods.cpython-36.pyc and b/k models/exchange/__pycache__/methods.cpython-36.pyc differ diff --git a/k models/exchange/containers.py b/k models/exchange/containers.py index 61b683b..0619347 100644 --- a/k models/exchange/containers.py +++ b/k models/exchange/containers.py @@ -13,10 +13,12 @@ from matplotlib import pyplot as plt from sklearn.metrics import mean_squared_error from sklearn.preprocessing import MinMaxScaler +from typing import Tuple, Union class BaseContainer(): def check_config(self, cf: dict) -> bool: + # TODO: Add config check print("Configuration check passed.") return True @@ -29,69 +31,100 @@ class UnivariateContainer(BaseContainer): def __init__( self, - raw: np.ndarray, + series: pd.Series, config: dict={ "method": "diff", - "diff_lag": 1, - "diff_order": 1, + "diff.lag": 1, + "diff.order": 1, "test_ratio": 0.2, - "lag_for_sup": 32 + "lag_for_sup": 3, + "target_idx": 0 }): assert self.check_config(config) self.config = config # Input format: num_obs * num_fea - assert len(raw.shape) == 2, \ - f"UnivariateDataContainer: Raw value is expected to have dim=2, dim={len(raw.shape)} received instead." + assert len(series.shape) == 1, \ + f"UnivariateDataContainer: Series feed is expected to have shape=(n,) as univariate series, \ + but shape={len(series.shape)} received instead." + self.series = series + self.raw = self.series.values.reshape(-1, 1) print( - f"UnivariateDataContainer: Raw values received: {raw.shape[0]} observations with {raw.shape[1]} features.") - self.num_obs, self.num_fea = raw.shape + f"UnivariateDataContainer: Univariate series with {len(self.series)} obs received.") + self.num_obs, self.num_fea = self.raw.shape self.differenced = self._difference( - raw, lag=self.config["diff_lag"], order=self.config["diff_order"] + self.raw, lag=self.config["diff.lag"], order=self.config["diff.order"] ) - self.sup_set = self._gen_sup_learning( + self.sup_set, self.tar_idx = self._gen_sup_learning( self.differenced, total_lag=self.config["lag_for_sup"]) + print( + f"Supervised Learning Problem Generated with target index {self.tar_idx}") self.sup_set = self.sup_set.values + self.sup_fea = self.sup_set.shape[1] - self.sample_size = len(self.sup) + self.sample_size = len(self.sup_set) self.test_size = int(self.sample_size * config["test_ratio"]) self.train_size = int(self.sample_size - self.test_size) # Split data # Note: idx0 = obs idx, idx1 = feature idx # Note: feature idx = 0 --> target - self.train_X, self.train_y, self.test_X, self.test_y = self._split_data(self.sup) - - print(f""" - Test and training data spliting finished: + + self.train_X, self.train_y, self.test_X, self.test_y \ + = self._split_data( + self.sup_set, + tar_idx=self.tar_idx + ) + + print(f"""\tTest and training data spliting finished: train X shape: {self.train_X.shape}, train y shape: {self.train_y.shape}, test X shape: {self.test_X.shape}, test y shape: {self.test_y.shape}""") # Scale the training data. - # self.scaler, self.diff_scaled = self._scale(self.differenced) + # Scaler are created based on training data set. NOT the whole dataset. + self.scaler_in, self.train_X_scaled = self._scale(self.train_X) + self.scaler_out, self.train_y_scaled = self._scale(self.train_y) + + self.test_X_scaled = self.scaler_in.transform(self.test_X) + self.test_y_sacled = self.scaler_out.transform(self.test_y) def __str__(self) -> str: - repr_str = f"""Univariate Data Contrainer at {hex(id(self))} - with {self.num_obs} obs and {self.num_fea} features, - Supervised Learning problem generated. - Total sample size: {self.sample_size} obs. - Training set size: {self.train_size} obs. - Testing set size: {self.test_size} obs. + # TODO: Add shapes of train/test Xy sets to report string. + repr_str = f"""\t{str(type(self))} object at {hex(id(self))} + Raw Data: + Dataset size: {self.num_obs} obs. + Number of features: {self.num_fea} features. + Supervised Learning problem generated: + Total sample size: {self.sample_size} obs. + Training set size: {self.train_size} obs. + Testing set size: {self.test_size} obs. """ return repr_str - + def __repr__(self): - self.__str__() + return self.__str__() + + def _split_data(self, data: np.ndarray, tar_idx: Union[int, list]=0) -> Tuple[np.ndarray]: + # Returing order: (train_X, train_y, test_X, test_y), univariate. + train, test = data[:self.train_size], data[self.train_size:] + assert train.shape[1] == self.sup_fea, \ + f"Got train shape: {train.shape}, expected feature: {self.sup_fea}" + fea_idx = list(range(train.shape[1])) + fea_idx.remove(tar_idx) - def _split_data(self): - pass # FIXME: Stopped here Sep. 1 2018 + train_X = train[:, fea_idx] + train_y = train[:, tar_idx].reshape(-1, 1) + test_X = test[:, fea_idx] + test_y = test[:, tar_idx].reshape(-1, 1) + + return (train_X, train_y, test_X, test_y) def _gen_sup_learning(self, data: np.ndarray, total_lag: int=1, nafill: object=0.0) \ - -> pd.DataFrame: + -> (pd.DataFrame, int): """ Generate superized learning problem. Transform the time series problem into a supervised learning @@ -107,11 +140,12 @@ def _gen_sup_learning(self, data: np.ndarray, total_lag: int=1, nafill: object=0 col_names = ["L0/current/target"] + \ [f"L{i}" for i in range(1, total_lag+1)] df.columns = col_names - return df + tar_idx = 0 + return (df, tar_idx) def _difference(self, data: np.ndarray, lag: int=1, order: int=1) -> np.ndarray: """ - Note: set lag=1 & order=0 to use the original data. + Note: set lag=1 & order=0 to use the original data. """ if order != 0: diff = list() @@ -123,17 +157,34 @@ def _difference(self, data: np.ndarray, lag: int=1, order: int=1) -> np.ndarray: return self._difference(diff, lag, order-1) return data - def _invert_difference(self, data: np.ndarray) -> np.ndarray: - pass - - def _scale(self, data: np.ndarray) \ - -> (sklearn.preprocessing.StandardScaler, np.ndarray): + def _invert_difference(self, data: np.ndarray, idx: int) -> np.ndarray: + """ + For initial stationarity removal order=1 only. + #TODO: add higher order of differencing support. Using recursion + """ + assert self.config["diff.order"] == 1, \ + "Initial stationarity removal differencing with order higher than 1 are not yet supported." + + lag = self.config["diff.lag"] + if idx - lag >= 0: + return self.raw[idx - lag + 1] + data + else: + return data + + def _scale(self, data: np.ndarray) -> ( + sklearn.preprocessing.StandardScaler, + np.ndarray): scaler = sklearn.preprocessing.StandardScaler().fit(data) data_scaled = scaler.transform(data) return scaler, data_scaled - def _invert_scale(self, data: np.ndarray): - pass + def _invert_scale_y(self, data: np.ndarray): + # Invert scale the output from model. + # Assert data type to be univariate time series data with shape (n,) or (n, 1) + assert len(data.shape) == 1 or (len(data.shape) == 2 and data.shape[1] == 1), \ + f"Unexpected data array feed, should be in shape (n,) or (n,1). Get {data.shape}" + data = data.reshape(-1, 1) + return self.scaler_out.inverse_transform(data) def reconstruct(self, data: np.ndarray): pass diff --git a/k models/exchange/ex_model.py b/k models/exchange/ex_model.py index bbbc492..630bba3 100644 --- a/k models/exchange/ex_model.py +++ b/k models/exchange/ex_model.py @@ -10,8 +10,12 @@ from matplotlib import pyplot as plt from sklearn.metrics import mean_squared_error from sklearn.preprocessing import MinMaxScaler +from pprint import pprint +import methods from methods import * +import containers +from containers import * config = { "batch_size": 1, @@ -22,10 +26,17 @@ } # Load dataset. -series = load_dataset(dir="../data/DEXCHUS.csv") +series = load_dataset( + dir="/Users/tianyudu/Documents/Github/AnnEcon/k models/exchange/DEXCHUS.csv") # Transform to stationary data to Delta 1 raw_values = series.values + +sample_data = np.array([x ** 2 for x in range(10)]).reshape(-1,1) + +c = UnivariateContainer(sample_data) +# cont = UnivariateContainer(raw_values.reshape(-1,1)) + # diff would have length = len(raw_value) - 1 as it's taking the gaps. diff_values = difference(raw_values, lag=1) @@ -43,6 +54,7 @@ f"Total sample found {total_sample_size}, {test_size} will be used as test set." ) train, test = sup[:-test_size], sup[-test_size:] +train2, test2 = sup[:train_size], sup[train_size:] # Generate scaler and scaling datasets. # scaler on input matrix(X) and output(y) diff --git a/k models/pandas/_libs/parsers.pyx b/k models/pandas/_libs/parsers.pyx new file mode 100644 index 0000000..e69de29 diff --git a/space.code-workspace b/space.code-workspace new file mode 100644 index 0000000..25033e6 --- /dev/null +++ b/space.code-workspace @@ -0,0 +1,11 @@ +{ + "folders": [ + { + "path": "k models" + }, + { + "path": "." + } + ], + "settings": {} +} \ No newline at end of file