From 7a81bc5d820c6886873bd9bc22eeaa370bf33a2b Mon Sep 17 00:00:00 2001
From: Erdogan Taskesen
Date: Sun, 24 May 2020 16:38:03 +0200
Subject: [PATCH] Rename input variable sparse_data to onehot

---
 pca/__init__.py |  5 +++++
 pca/examples.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++--
 pca/pca.py      | 43 ++++++++++++++++++++++++++---------------
 setup.py        |  2 +-
 4 files changed, 83 insertions(+), 18 deletions(-)

diff --git a/pca/__init__.py b/pca/__init__.py
index 24b740e..34de9b2 100644
--- a/pca/__init__.py
+++ b/pca/__init__.py
@@ -1,5 +1,10 @@
 from pca.pca import pca
 
+from pca.pca import (
+    import_example,
+    )
+
+
 __author__ = 'Erdogan Tasksen'
 __email__ = 'erdogant@gmail.com'
 __version__ = '1.0.2'

diff --git a/pca/examples.py b/pca/examples.py
index 44c69d8..4056472 100644
--- a/pca/examples.py
+++ b/pca/examples.py
@@ -1,6 +1,7 @@
+
 # %%
-# import pca
-# print(pca.__version__)
+import pca
+print(pca.__version__)
 
 # %%
 from sklearn.datasets import load_iris
@@ -29,6 +30,10 @@
 model = pca()
 Xnew = model.norm(X)
 
+
+
+
+
 # %%
 X = pd.read_csv('D://GITLAB/MASTERCLASS/embeddings/data/TCGA_RAW.zip',compression='zip')
 metadata = pd.read_csv('D://GITLAB/MASTERCLASS/embeddings/data/metadata.csv', sep=';')
@@ -75,6 +80,48 @@
 model.scatter3d()
 ax = model.biplot3d(n_feat=20)
 
+# %% Example with mixed dataset
+import pca
+# Import example
+df = pca.import_example()
+
+# Transform data into one-hot
+from df2onehot import df2onehot
+y = df['Survived'].values
+del df['Survived']
+del df['PassengerId']
+del df['Name']
+out = df2onehot(df)
+X = out['onehot'].copy()
+X.index = y
+
+
+from pca import pca
+
+# Initialize
+model1 = pca(normalize=False, onehot=False)
+# Run model 1
+_=model1.fit_transform(X)
+model1.plot()
+model1.biplot(n_feat=3)
+model1.scatter()
+model1.biplot3d(n_feat=3)
+
+# Initialize
+model2 = pca(normalize=True, onehot=False)
+# Run model 2
+_=model2.fit_transform(X)
+model2.plot()
+model2.biplot(n_feat=4)
+model2.scatter()
+model2.biplot3d(n_feat=3)
+
+# Initialize
+model3 = pca(normalize=False, onehot=True)
+# Run model 3
+_=model3.fit_transform(X)
+model3.biplot(n_feat=3)
+
 # %%
 # # EXAMPLE
 # import pca
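For context, a minimal standalone sketch of the renamed flag (illustrative only: the random boolean matrix below stands in for the df2onehot output used in examples.py above, and is not part of the patch):

    # Sketch: PCA on one-hot (boolean) data with the renamed onehot flag.
    import numpy as np
    import pandas as pd
    from pca import pca

    # A small boolean matrix as a stand-in for one-hot encoded data.
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.integers(0, 2, size=(100, 6)).astype(bool),
                     columns=['feat_%d' %i for i in range(6)])

    # With onehot=False and normalize=False, the new warning in
    # _preprocessing() fires for boolean input; onehot=True avoids it.
    model = pca(onehot=True)
    _ = model.fit_transform(X)
    model.biplot(n_feat=3)
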
diff --git a/pca/pca.py b/pca/pca.py
index 16e24ec..64bf5f6 100644
--- a/pca/pca.py
+++ b/pca/pca.py
@@ -16,16 +17,17 @@
 from mpl_toolkits.mplot3d import Axes3D
 import scipy.sparse as sp
 import colourmap as colourmap
-
+import os
+import wget
 
 # %% Association learning across all variables
 class pca():
-    def __init__(self, n_components=0.95, n_feat=25, sparse_data=False, normalize=False, random_state=None):
+    def __init__(self, n_components=0.95, n_feat=25, onehot=False, normalize=False, random_state=None):
         """Initialize pca with user-defined parameters.
 
         Parameters
         ----------
-        sparse_data : [Bool] optional, (default: False)
+        onehot : [Bool] optional, (default: False)
             Boolean: Set True if X is a sparse data set, such as the output of a tfidf model: many zeros and few numbers. Note that this is different from a sparse matrix; sparse data can be stored in a sparse matrix.
         n_components : [0,..,1] or [1,..number of samples-1], (default: 0.95)
             Number of TOP components to be returned. Values>=1 are interpreted as the number of components; values<1 as the minimum fraction of explained variance to cover.
@@ -39,7 +40,7 @@ def __init__(self, n_components=0.95, n_feat=25, sparse_data=False, normalize=Fa
         """
         # Store in object
         self.n_components = n_components
-        self.sparse_data = sparse_data
+        self.onehot = onehot
         self.normalize = normalize
         self.random_state = random_state
         self.n_feat = n_feat
@@ -101,12 +102,12 @@ def fit_transform(self, X, row_labels=None, col_labels=None, verbose=3):
         if self.n_components<1:
             pcp = self.n_components
             # Run with all components to get all PCs back. This is needed for the step after.
-            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=None, sparse_data=self.sparse_data, random_state=self.random_state)
+            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=None, onehot=self.onehot, random_state=self.random_state)
             # Take number of components with minimal [n_components] explained variance
             self.n_components = np.min(np.where(percentExplVar >= self.n_components)[0]) + 1
             if verbose>=3: print('[pca] >Number of components is [%d] that covers the [%.2f%%] explained variance.' %(self.n_components, pcp*100))
         else:
-            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=self.n_components, sparse_data=self.sparse_data, random_state=self.random_state)
+            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=self.n_components, onehot=self.onehot, random_state=self.random_state)
             pcp = percentExplVar[np.minimum(len(percentExplVar)-1, self.n_components)]
 
         # Combine components relations with features.
@@ -208,7 +209,10 @@ def _preprocessing(self, X, row_labels, col_labels, verbose=3):
         if verbose>=3: print('[pca] >n_components is set to %d' %(self.n_components))
 
         self.n_feat = np.min([self.n_feat, X.shape[1]])
-
+
+        if (not self.onehot) and (not self.normalize) and (str(X.values.dtype)=='bool'):
+            if verbose>=2: print('[pca] >Warning: Boolean (sparse or one-hot) input data detected. It is highly recommended to set onehot=True or, alternatively, normalize=True.')
+
         # if sp.issparse(X):
             # if verbose>=1: print('[PCA] Error: A sparse matrix was passed, but dense data is required for method=barnes_hut. Use X.toarray() to convert to a dense numpy array if the array is small enough for it to fit in memory.')
         if isinstance(X, pd.DataFrame):
@@ -234,15 +238,24 @@
 
         # normalize data
         if self.normalize:
-            if verbose>=3: print('[pca] >Normalizing input data..')
-            X = preprocessing.scale(X)
-            # fig,ax =plt.subplots(figsize=(10,6))
-            # ax.hist(X.values.flatten(), bins=50)
-            # ax.set_ylabel('frequency')
-            # ax.set_xlabel('Genomic values')
-            # ax.grid(True)
+            if verbose>=3: print('[pca] >Normalizing input data per feature (zero mean, unit variance)..')
+            # Plot the data distribution
+            # fig,(ax1,ax2)=plt.subplots(1,2, figsize=(15,5))
+            # ax1.hist(X.ravel().astype(float), bins=50)
+            # ax1.set_ylabel('frequency')
+            # ax1.set_xlabel('Values')
+            # ax1.set_title('RAW')
+            # ax1.grid(True)
+
+            X = preprocessing.scale(X, with_mean=True, with_std=True, axis=0)
+
+            # Plot the data distribution
+            # ax2.hist(X.ravel().astype(float), bins=50)
+            # ax2.set_ylabel('frequency')
+            # ax2.set_xlabel('Values')
+            # ax2.set_title('Zero-mean with unit variance normalized')
+            # ax2.grid(True)
 
-        # Return
         return(X, row_labels, col_labels)

diff --git a/setup.py b/setup.py
index 6d73894..a432304 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 setuptools.setup(
-     install_requires=['matplotlib','numpy','sklearn','scipy','colourmap','pandas','tqdm'],
+     install_requires=['wget','matplotlib','numpy','sklearn','scipy','colourmap','pandas','tqdm'],
     python_requires='>=3',
     name='pca',
     version=new_version,
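As background for the normalization change in _preprocessing(), a short sketch of what preprocessing.scale(X, with_mean=True, with_std=True, axis=0) computes; these arguments are scikit-learn's defaults, so the call standardizes each column (illustrative only, not part of the patch):

    # Sketch: per-feature zero-mean, unit-variance scaling, as applied
    # when the model is initialized with normalize=True.
    import numpy as np
    from sklearn import preprocessing

    X = np.array([[1.0, 10.0],
                  [2.0, 20.0],
                  [3.0, 30.0]])
    Xs = preprocessing.scale(X, with_mean=True, with_std=True, axis=0)

    # Equivalent to (X - X.mean(axis=0)) / X.std(axis=0) per column.
    print(Xs.mean(axis=0))  # approximately [0. 0.]
    print(Xs.std(axis=0))   # approximately [1. 1.]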