From 7a81bc5d820c6886873bd9bc22eeaa370bf33a2b Mon Sep 17 00:00:00 2001
From: Erdogan Taskesen
Date: Sun, 24 May 2020 16:38:03 +0200
Subject: [PATCH] Rename input variable sparse_data to onehot

---
 pca/__init__.py |  5 +++++
 pca/examples.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++--
 pca/pca.py      | 43 ++++++++++++++++++++++++++---------------
 setup.py        |  2 +-
 4 files changed, 83 insertions(+), 18 deletions(-)

diff --git a/pca/__init__.py b/pca/__init__.py
index 24b740e..34de9b2 100644
--- a/pca/__init__.py
+++ b/pca/__init__.py
@@ -1,5 +1,10 @@
 from pca.pca import pca
 
+from pca.pca import (
+    import_example,
+    )
+
+
 __author__ = 'Erdogan Tasksen'
 __email__ = 'erdogant@gmail.com'
 __version__ = '1.0.2'

diff --git a/pca/examples.py b/pca/examples.py
index 44c69d8..4056472 100644
--- a/pca/examples.py
+++ b/pca/examples.py
@@ -1,6 +1,7 @@
+
 # %%
-# import pca
-# print(pca.__version__)
+import pca
+print(pca.__version__)
 
 # %%
 from sklearn.datasets import load_iris
@@ -29,6 +30,10 @@
 model = pca()
 Xnew = model.norm(X)
 
+
+
+
+
 # %%
 X = pd.read_csv('D://GITLAB/MASTERCLASS/embeddings/data/TCGA_RAW.zip',compression='zip')
 metadata = pd.read_csv('D://GITLAB/MASTERCLASS/embeddings/data/metadata.csv', sep=';')
@@ -75,6 +80,48 @@
 model.scatter3d()
 ax = model.biplot3d(n_feat=20)
 
+# %% Example with mixed dataset
+import pca
+# Import example
+df = pca.import_example()
+
+# Transform data into one-hot
+from df2onehot import df2onehot
+y = df['Survived'].values
+del df['Survived']
+del df['PassengerId']
+del df['Name']
+out = df2onehot(df)
+X = out['onehot'].copy()
+X.index = y
+
+
+from pca import pca
+
+# Initialize
+model1 = pca(normalize=False, onehot=False)
+# Run model 1
+_=model1.fit_transform(X)
+model1.plot()
+model1.biplot(n_feat=3)
+model1.scatter()
+model1.biplot3d(n_feat=3)
+
+# Initialize
+model2 = pca(normalize=True, onehot=False)
+# Run model 2
+_=model2.fit_transform(X)
+model2.plot()
+model2.biplot(n_feat=4)
+model2.scatter()
+model2.biplot3d(n_feat=3)
+
+# Initialize
+model3 = pca(normalize=False, onehot=True)
+# Run model 3
+_=model3.fit_transform(X)
+model3.biplot(n_feat=3)
+
 # %%
 # # EXAMPLE
 # import pca
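For context, a minimal standalone sketch of the renamed flag (illustrative only: the random boolean matrix below stands in for the df2onehot output used in examples.py above, and is not part of the patch):

    # Sketch: PCA on one-hot (boolean) data with the renamed onehot flag.
    import numpy as np
    import pandas as pd
    from pca import pca

    # A small boolean matrix as a stand-in for one-hot encoded data.
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.integers(0, 2, size=(100, 6)).astype(bool),
                     columns=['feat_%d' %i for i in range(6)])

    # With onehot=False and normalize=False, the new warning in
    # _preprocessing() fires for boolean input; onehot=True avoids it.
    model = pca(onehot=True)
    _ = model.fit_transform(X)
    model.biplot(n_feat=3)
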
diff --git a/pca/pca.py b/pca/pca.py
index 16e24ec..64bf5f6 100644
--- a/pca/pca.py
+++ b/pca/pca.py
@@ -16,16 +17,17 @@
 from mpl_toolkits.mplot3d import Axes3D
 import scipy.sparse as sp
 import colourmap as colourmap
-
+import os
+import wget
 
 # %% Association learning across all variables
 class pca():
-    def __init__(self, n_components=0.95, n_feat=25, sparse_data=False, normalize=False, random_state=None):
+    def __init__(self, n_components=0.95, n_feat=25, onehot=False, normalize=False, random_state=None):
         """Initialize pca with user-defined parameters.
 
         Parameters
         ----------
-        sparse_data : [Bool] optional, (default: False)
+        onehot : [Bool] optional, (default: False)
             Boolean: Set True if X is a sparse data set, such as the output of a tfidf model: many zeros and few numbers. Note that this is different from a sparse matrix; sparse data can be stored in a sparse matrix.
         n_components : [0,..,1] or [1,..number of samples-1], (default: 0.95)
             Number of TOP components to be returned. Values>=1 are interpreted as the number of components; values<1 as the minimum fraction of explained variance to cover.
@@ -39,7 +40,7 @@ def __init__(self, n_components=0.95, n_feat=25, sparse_data=False, normalize=Fa
         """
         # Store in object
         self.n_components = n_components
-        self.sparse_data = sparse_data
+        self.onehot = onehot
         self.normalize = normalize
         self.random_state = random_state
         self.n_feat = n_feat
@@ -101,12 +102,12 @@ def fit_transform(self, X, row_labels=None, col_labels=None, verbose=3):
         if self.n_components<1:
             pcp = self.n_components
             # Run with all components to get all PCs back. This is needed for the step after.
-            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=None, sparse_data=self.sparse_data, random_state=self.random_state)
+            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=None, onehot=self.onehot, random_state=self.random_state)
             # Take number of components with minimal [n_components] explained variance
             self.n_components = np.min(np.where(percentExplVar >= self.n_components)[0]) + 1
             if verbose>=3: print('[pca] >Number of components is [%d] that covers the [%.2f%%] explained variance.' %(self.n_components, pcp*100))
         else:
-            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=self.n_components, sparse_data=self.sparse_data, random_state=self.random_state)
+            model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=self.n_components, onehot=self.onehot, random_state=self.random_state)
             pcp = percentExplVar[np.minimum(len(percentExplVar)-1, self.n_components)]
 
         # Combine components relations with features.
@@ -208,7 +209,10 @@ def _preprocessing(self, X, row_labels, col_labels, verbose=3):
         if verbose>=3: print('[pca] >n_components is set to %d' %(self.n_components))
 
         self.n_feat = np.min([self.n_feat, X.shape[1]])
-
+
+        if (not self.onehot) and (not self.normalize) and (str(X.values.dtype)=='bool'):
+            if verbose>=2: print('[pca] >Warning: Boolean (sparse or one-hot) input data detected. It is highly recommended to set onehot=True or, alternatively, normalize=True.')
+
         # if sp.issparse(X):
             # if verbose>=1: print('[PCA] Error: A sparse matrix was passed, but dense data is required for method=barnes_hut. Use X.toarray() to convert to a dense numpy array if the array is small enough for it to fit in memory.')
         if isinstance(X, pd.DataFrame):
@@ -234,15 +238,24 @@
 
         # normalize data
         if self.normalize:
-            if verbose>=3: print('[pca] >Normalizing input data..')
-            X = preprocessing.scale(X)
-            # fig,ax =plt.subplots(figsize=(10,6))
-            # ax.hist(X.values.flatten(), bins=50)
-            # ax.set_ylabel('frequency')
-            # ax.set_xlabel('Genomic values')
-            # ax.grid(True)
+            if verbose>=3: print('[pca] >Normalizing input data per feature (zero mean, unit variance)..')
+            # Plot the data distribution
+            # fig,(ax1,ax2)=plt.subplots(1,2, figsize=(15,5))
+            # ax1.hist(X.ravel().astype(float), bins=50)
+            # ax1.set_ylabel('frequency')
+            # ax1.set_xlabel('Values')
+            # ax1.set_title('RAW')
+            # ax1.grid(True)
+
+            X = preprocessing.scale(X, with_mean=True, with_std=True, axis=0)
+
+            # Plot the data distribution
+            # ax2.hist(X.ravel().astype(float), bins=50)
+            # ax2.set_ylabel('frequency')
+            # ax2.set_xlabel('Values')
+            # ax2.set_title('Zero-mean with unit variance normalized')
+            # ax2.grid(True)
 
-        # Return
         return(X, row_labels, col_labels)

diff --git a/setup.py b/setup.py
index 6d73894..a432304 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 setuptools.setup(
-     install_requires=['matplotlib','numpy','sklearn','scipy','colourmap','pandas','tqdm'],
+     install_requires=['wget','matplotlib','numpy','sklearn','scipy','colourmap','pandas','tqdm'],
     python_requires='>=3',
     name='pca',
     version=new_version,
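As background for the normalization change in _preprocessing(), a short sketch of what preprocessing.scale(X, with_mean=True, with_std=True, axis=0) computes; these arguments are scikit-learn's defaults, so the call standardizes each column (illustrative only, not part of the patch):

    # Sketch: per-feature zero-mean, unit-variance scaling, as applied
    # when the model is initialized with normalize=True.
    import numpy as np
    from sklearn import preprocessing

    X = np.array([[1.0, 10.0],
                  [2.0, 20.0],
                  [3.0, 30.0]])
    Xs = preprocessing.scale(X, with_mean=True, with_std=True, axis=0)

    # Equivalent to (X - X.mean(axis=0)) / X.std(axis=0) per column.
    print(Xs.mean(axis=0))  # approximately [0. 0.]
    print(Xs.std(axis=0))   # approximately [1. 1.]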