Skip to content

Commit

Permalink
complete UnivariateContainer class and change input feed from ndarray…
Browse files Browse the repository at this point in the history
… to series
  • Loading branch information
TianyuDu committed Sep 1, 2018
1 parent 9c11322 commit 04a1ad3
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 37 deletions.
Binary file modified k models/exchange/__pycache__/containers.cpython-36.pyc
Binary file not shown.
Binary file modified k models/exchange/__pycache__/methods.cpython-36.pyc
Binary file not shown.
123 changes: 87 additions & 36 deletions k models/exchange/containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from typing import Tuple, Union


class BaseContainer():
def check_config(self, cf: dict) -> bool:
    """Report that the given configuration is acceptable.

    No validation is implemented yet: ``cf`` is not inspected, a
    confirmation line is printed, and ``True`` is always returned.
    """
    # TODO: Add config check
    confirmation = "Configuration check passed."
    print(confirmation)
    return True

Expand All @@ -29,69 +31,100 @@ class UnivariateContainer(BaseContainer):

def __init__(self, series: pd.Series, config: dict = None):
    """Build the supervised-learning data sets from a univariate series.

    The series is reshaped to a single-column array, differenced,
    converted to a supervised-learning table, split into train/test
    parts and scaled. Scalers are fitted on the training part only and
    then applied to the test part.

    Args:
        series: One-dimensional series of observations (``shape == (n,)``).
        config: Settings dictionary. The keys read here are
            ``"diff.lag"``, ``"diff.order"``, ``"lag_for_sup"`` and
            ``"test_ratio"``. When omitted, a fresh default dictionary
            is created for this instance.

    Raises:
        AssertionError: If the configuration check fails or ``series``
            is not one-dimensional.
    """
    if config is None:
        # Build the defaults per call: a dict literal used as a default
        # argument would be shared (and mutable) across all instances.
        config = {
            "method": "diff",
            "diff.lag": 1,
            "diff.order": 1,
            "test_ratio": 0.2,
            "lag_for_sup": 3,
            "target_idx": 0,
        }
    assert self.check_config(config)
    self.config = config

    # Input format: a one-dimensional series, stored as a (num_obs, 1) array.
    assert len(series.shape) == 1, (
        "UnivariateDataContainer: Series feed is expected to have "
        "shape=(n,) as univariate series, "
        f"but shape={series.shape} received instead."
    )
    self.series = series
    self.raw = self.series.values.reshape(-1, 1)
    print(
        f"UnivariateDataContainer: Univariate series with {len(self.series)} obs received.")
    self.num_obs, self.num_fea = self.raw.shape

    self.differenced = self._difference(
        self.raw,
        lag=self.config["diff.lag"],
        order=self.config["diff.order"]
    )

    self.sup_set, self.tar_idx = self._gen_sup_learning(
        self.differenced, total_lag=self.config["lag_for_sup"])
    print(
        f"Supervised Learning Problem Generated with target index {self.tar_idx}")
    # Keep only the underlying array from here on.
    self.sup_set = self.sup_set.values
    self.sup_fea = self.sup_set.shape[1]

    self.sample_size = len(self.sup_set)
    self.test_size = int(self.sample_size * config["test_ratio"])
    self.train_size = int(self.sample_size - self.test_size)

    # Split data
    # Note: idx0 = obs idx, idx1 = feature idx
    # Note: feature idx = 0 --> target
    self.train_X, self.train_y, self.test_X, self.test_y \
        = self._split_data(
            self.sup_set,
            tar_idx=self.tar_idx
        )

    print(f"""\tTest and training data spliting finished:
train X shape: {self.train_X.shape},
train y shape: {self.train_y.shape},
test X shape: {self.test_X.shape},
test y shape: {self.test_y.shape}""")

    # Scale the training data.
    # Scalers are created based on training data set. NOT the whole dataset.
    self.scaler_in, self.train_X_scaled = self._scale(self.train_X)
    self.scaler_out, self.train_y_scaled = self._scale(self.train_y)

    self.test_X_scaled = self.scaler_in.transform(self.test_X)
    self.test_y_scaled = self.scaler_out.transform(self.test_y)
    # Historical misspelling kept as an alias so existing readers of
    # the attribute keep working.
    self.test_y_sacled = self.test_y_scaled

def __str__(self) -> str:
    """Return a multi-line report of the raw data and the train/test split.

    The report lists the number of raw observations and features, the
    supervised-learning sample size, and the training and testing set
    sizes, all read from attributes of the instance.
    """
    # TODO: Add shapes of train/test Xy sets to report string.
    repr_str = f"""\t{str(type(self))} object at {hex(id(self))}
Raw Data:
Dataset size: {self.num_obs} obs.
Number of features: {self.num_fea} features.
Supervised Learning problem generated:
Total sample size: {self.sample_size} obs.
Training set size: {self.train_size} obs.
Testing set size: {self.test_size} obs.
"""
    return repr_str

def __repr__(self):
    """Return the same report text that ``__str__`` produces."""
    # Build the report exactly once; a bare call whose result is
    # discarded would only repeat the work.
    return self.__str__()

def _split_data(self, data: np.ndarray, tar_idx: Union[int, list]=0) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split a supervised-learning table into train/test inputs and targets.

    The first ``self.train_size`` rows form the training part and the
    remaining rows the testing part. Columns named by ``tar_idx`` become
    the targets; every other column is a feature.

    Args:
        data: Two-dimensional array, rows are observations and columns
            are variables; must have ``self.sup_fea`` columns.
        tar_idx: Column index of the target, or a list of such indices.

    Returns:
        ``(train_X, train_y, test_X, test_y)``. The ``y`` arrays have one
        column per target, so a single integer ``tar_idx`` yields shape
        ``(n, 1)``.

    Raises:
        AssertionError: If the column count differs from ``self.sup_fea``.
        ValueError: If a target index is not a valid column index.
    """
    # Returning order: (train_X, train_y, test_X, test_y), univariate.
    train, test = data[:self.train_size], data[self.train_size:]

    assert train.shape[1] == self.sup_fea, \
        f"Got train shape: {train.shape}, expected feature: {self.sup_fea}"

    num_cols = train.shape[1]
    # Normalise to a list so an int and a list of ints share one code path.
    if isinstance(tar_idx, (int, np.integer)):
        tar_cols = [int(tar_idx)]
    else:
        tar_cols = [int(t) for t in tar_idx]
    for col in tar_cols:
        if not 0 <= col < num_cols:
            raise ValueError(
                f"Target index {col} is not a column of data with {num_cols} columns.")

    fea_idx = [col for col in range(num_cols) if col not in tar_cols]

    # Indexing with a list keeps the result two-dimensional.
    train_X = train[:, fea_idx]
    train_y = train[:, tar_cols]

    test_X = test[:, fea_idx]
    test_y = test[:, tar_cols]

    return (train_X, train_y, test_X, test_y)

def _gen_sup_learning(self, data: np.ndarray, total_lag: int=1, nafill: object=0.0) \
-> pd.DataFrame:
-> (pd.DataFrame, int):
"""
Generate supervised learning problem.
Transform the time series problem into a supervised learning
Expand All @@ -107,11 +140,12 @@ def _gen_sup_learning(self, data: np.ndarray, total_lag: int=1, nafill: object=0
col_names = ["L0/current/target"] + \
[f"L{i}" for i in range(1, total_lag+1)]
df.columns = col_names
return df
tar_idx = 0
return (df, tar_idx)

def _difference(self, data: np.ndarray, lag: int=1, order: int=1) -> np.ndarray:
"""
Note: set lag=1 & order=0 to use the original data.
Note: set lag=1 & order=0 to use the original data.
"""
if order != 0:
diff = list()
Expand All @@ -123,17 +157,34 @@ def _difference(self, data: np.ndarray, lag: int=1, order: int=1) -> np.ndarray:
return self._difference(diff, lag, order-1)
return data

def _invert_difference(self, data: np.ndarray) -> np.ndarray:
pass

def _scale(self, data: np.ndarray) \
-> (sklearn.preprocessing.StandardScaler, np.ndarray):
def _invert_difference(self, data: np.ndarray, idx: int) -> np.ndarray:
    """
    Add back a raw value to undo first-order differencing.

    Only ``config["diff.order"] == 1`` is supported. When ``idx`` minus
    the configured lag is non-negative, the raw observation at position
    ``idx - lag + 1`` is added to ``data``; otherwise ``data`` is
    returned unchanged.
    # TODO: add higher order of differencing support, using recursion.
    # NOTE(review): the ``+ 1`` offset is kept exactly as written; confirm
    # it against the indexing used by the differencing step.
    """
    assert self.config["diff.order"] == 1, \
        "Initial stationarity removal differencing with order higher than 1 are not yet supported."

    offset = idx - self.config["diff.lag"]
    if offset < 0:
        # No earlier raw observation to add back.
        return data
    return self.raw[offset + 1] + data

def _scale(self, data: np.ndarray) -> Tuple[
        sklearn.preprocessing.StandardScaler,
        np.ndarray]:
    """Fit a ``StandardScaler`` on ``data`` and return it with the scaled data.

    Args:
        data: Array the scaler is fitted on and then applied to.

    Returns:
        ``(scaler, data_scaled)``: the fitted scaler and the transformed
        copy of ``data``.
    """
    scaler = sklearn.preprocessing.StandardScaler().fit(data)
    data_scaled = scaler.transform(data)
    return scaler, data_scaled

def _invert_scale(self, data: np.ndarray):
pass
def _invert_scale_y(self, data: np.ndarray):
    """Map scaled model output back to the original target scale.

    Args:
        data: Univariate values with shape ``(n,)`` or ``(n, 1)``. Any
            array-like (ndarray, pandas Series, list) is accepted and
            converted with ``np.asarray``.

    Returns:
        The result of ``self.scaler_out.inverse_transform`` applied to
        the values arranged as a single column.

    Raises:
        AssertionError: If ``data`` is neither ``(n,)`` nor ``(n, 1)``.
    """
    # Coerce first: a pandas Series or a plain list has no usable
    # ``reshape`` method, while an ndarray passes through unchanged.
    data = np.asarray(data)
    # Assert data type to be univariate time series data with shape (n,) or (n, 1)
    assert len(data.shape) == 1 or (len(data.shape) == 2 and data.shape[1] == 1), \
        f"Unexpected data array feed, should be in shape (n,) or (n,1). Get {data.shape}"
    column = data.reshape(-1, 1)
    return self.scaler_out.inverse_transform(column)

def reconstruct(self, data: np.ndarray):
    """Placeholder, not implemented yet: ignores ``data`` and returns ``None``."""
    return None
14 changes: 13 additions & 1 deletion k models/exchange/ex_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from pprint import pprint

import methods
from methods import *
import containers
from containers import *

config = {
"batch_size": 1,
Expand All @@ -22,10 +26,17 @@
}

# Load dataset.
series = load_dataset(dir="../data/DEXCHUS.csv")
series = load_dataset(
dir="/Users/tianyudu/Documents/Github/AnnEcon/k models/exchange/DEXCHUS.csv")

# Transform to stationary data to Delta 1
raw_values = series.values

sample_data = np.array([x ** 2 for x in range(10)]).reshape(-1,1)

c = UnivariateContainer(sample_data)
# cont = UnivariateContainer(raw_values.reshape(-1,1))

# diff would have length = len(raw_value) - 1 as it's taking the gaps.
diff_values = difference(raw_values, lag=1)

Expand All @@ -43,6 +54,7 @@
f"Total sample found {total_sample_size}, {test_size} will be used as test set."
)
train, test = sup[:-test_size], sup[-test_size:]
train2, test2 = sup[:train_size], sup[train_size:]

# Generate scaler and scaling datasets.
# scaler on input matrix(X) and output(y)
Expand Down
Empty file.
11 changes: 11 additions & 0 deletions space.code-workspace
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"folders": [
{
"path": "k models"
},
{
"path": "."
}
],
"settings": {}
}

0 comments on commit 04a1ad3

Please sign in to comment.