Commit
Input variable name changed from sparse_data to onehot
erdogant committed May 24, 2020
1 parent fd20585 commit 7a81bc5
Showing 4 changed files with 83 additions and 18 deletions.
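For callers of the package, only the keyword name changes with this commit; a minimal before/after sketch (version number taken from __init__.py below):

from pca import pca

# Before this commit: sparse/one-hot input was flagged with sparse_data
# model = pca(n_components=0.95, sparse_data=True)

# After this commit (v1.0.2): the same flag is named onehot
model = pca(n_components=0.95, onehot=True)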
5 changes: 5 additions & 0 deletions pca/__init__.py
@@ -1,5 +1,10 @@
from pca.pca import pca

from pca.pca import (
import_example,
)


__author__ = 'Erdogan Tasksen'
__email__ = 'erdogant@gmail.com'
__version__ = '1.0.2'
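Since import_example is now re-exported from the package root, it can be imported directly; a short usage sketch (that it returns a pandas DataFrame and downloads data is assumed from examples.py and the new wget dependency in setup.py):

from pca import import_example

df = import_example()  # assumed: fetches the example dataset and returns a DataFrame
print(df.shape)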
51 changes: 49 additions & 2 deletions pca/examples.py
@@ -1,6 +1,7 @@

# %%
- # import pca
- # print(pca.__version__)
+ import pca
+ print(pca.__version__)

# %%
from sklearn.datasets import load_iris
@@ -29,6 +30,10 @@
model = pca()
Xnew = model.norm(X)





# %%
X = pd.read_csv('D://GITLAB/MASTERCLASS/embeddings/data/TCGA_RAW.zip',compression='zip')
metadata = pd.read_csv('D://GITLAB/MASTERCLASS/embeddings/data/metadata.csv', sep=';')
@@ -75,6 +80,48 @@
model.scatter3d()
ax = model.biplot3d(n_feat=20)

# %% Example with mixed dataset
import pca
# Import example
df = pca.import_example()

# Transform data into one-hot
from df2onehot import df2onehot
y = df['Survived'].values
del df['Survived']
del df['PassengerId']
del df['Name']
out = df2onehot(df)
X = out['onehot'].copy()
X.index = y


from pca import pca

# Initialize
model1 = pca(normalize=False, onehot=False)
# Run model 1
_=model1.fit_transform(X)
model1.plot()
model1.biplot(n_feat=3)
model1.scatter()
model1.biplot3d(n_feat=3)

# Initialize
model2 = pca(normalize=True, onehot=False)
# Run model 2
_=model2.fit_transform(X)
model2.plot()
model2.biplot(n_feat=4)
model2.scatter()
model2.biplot3d(n_feat=3)

# Initialize
model3 = pca(normalize=False, onehot=True)
# Run model 3
_=model3.fit_transform(X)
model3.biplot(n_feat=3)

# %%
# # EXAMPLE
# import pca
43 changes: 28 additions & 15 deletions pca/pca.py
@@ -16,16 +16,17 @@
from mpl_toolkits.mplot3d import Axes3D
import scipy.sparse as sp
import colourmap as colourmap

import os
import wget

# %% Association learning across all variables
class pca():
- def __init__(self, n_components=0.95, n_feat=25, sparse_data=False, normalize=False, random_state=None):
+ def __init__(self, n_components=0.95, n_feat=25, onehot=False, normalize=False, random_state=None):
"""Initialize pca with user-defined parameters.
Parameters
----------
- sparse_data : [Bool] optional, (default: False)
+ onehot : [Bool] optional, (default: False)
Boolean: Set True if X is a sparse data set, such as the output of a tf-idf model: many zeros and few nonzero values. Note that this is different from a sparse matrix; sparse data can be stored in a sparse matrix.
n_components : [0,..,1] or [1,..number of samples-1], (default: 0.95)
Number of TOP components to be returned. Values>=1 are interpreted as the number of components. Values<1 are interpreted as the minimal fraction of explained variance that the returned components must cover.
@@ -39,7 +40,7 @@ def __init__(self, n_components=0.95, n_feat=25, sparse_data=False, normalize=Fa
"""
# Store in object
self.n_components = n_components
- self.sparse_data = sparse_data
+ self.onehot = onehot
self.normalize = normalize
self.random_state = random_state
self.n_feat = n_feat
@@ -101,12 +102,12 @@ def fit_transform(self, X, row_labels=None, col_labels=None, verbose=3):
if self.n_components<1:
pcp = self.n_components
# Run with all components to get all PCs back. This is needed for the step after.
- model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=None, sparse_data=self.sparse_data, random_state=self.random_state)
+ model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=None, onehot=self.onehot, random_state=self.random_state)
# Take number of components with minimal [n_components] explained variance
self.n_components = np.min(np.where(percentExplVar >= self.n_components)[0]) + 1
if verbose>=3: print('[pca] >Number of components is [%d] that covers the [%.2f%%] explained variance.' %(self.n_components, pcp*100))
else:
- model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=self.n_components, sparse_data=self.sparse_data, random_state=self.random_state)
+ model_pca, PC, loadings, percentExplVar = _explainedvar(X, n_components=self.n_components, onehot=self.onehot, random_state=self.random_state)
pcp = percentExplVar[np.minimum(len(percentExplVar)-1, self.n_components)]

# Combine components relations with features.
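The variance-threshold branch above selects the smallest number of components whose explained variance reaches n_components; a small numeric sketch, assuming percentExplVar holds the cumulative explained-variance ratio (as the >= comparison suggests):

import numpy as np

percentExplVar = np.array([0.70, 0.90, 0.96, 0.99])  # hypothetical cumulative ratios
n_components = 0.95
n_selected = np.min(np.where(percentExplVar >= n_components)[0]) + 1  # -> 3 components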
@@ -208,7 +209,10 @@ def _preprocessing(self, X, row_labels, col_labels, verbose=3):
if verbose>=3: print('[pca] >n_components is set to %d' %(self.n_components))

self.n_feat = np.min([self.n_feat, X.shape[1]])


if (not self.onehot) and (not self.normalize) and (str(X.values.dtype)=='bool'):
if verbose>=2: print('[pca] >Warning: Sparse or one-hot boolean input data is detected; it is highly recommended to set onehot=True or, alternatively, normalize=True')

# if sp.issparse(X):
# if verbose>=1: print('[PCA] Error: A sparse matrix was passed, but dense data is required for method=barnes_hut. Use X.toarray() to convert to a dense numpy array if the array is small enough for it to fit in memory.')
if isinstance(X, pd.DataFrame):
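The new dtype guard above fires on boolean matrices such as df2onehot output; a minimal sketch of input that would take the warning path (column names are illustrative):

import pandas as pd

X = pd.DataFrame({'male': [True, False, True],
                  'embarked_S': [False, True, True]})
print(str(X.values.dtype))  # 'bool' -> warning when onehot=False and normalize=False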
@@ -234,15 +238,24 @@ def _preprocessing(self, X, row_labels, col_labels, verbose=3):

# normalize data
if self.normalize:
- if verbose>=3: print('[pca] >Normalizing input data..')
- X = preprocessing.scale(X)
- # fig,ax =plt.subplots(figsize=(10,6))
- # ax.hist(X.values.flatten(), bins=50)
- # ax.set_ylabel('frequency')
- # ax.set_xlabel('Genomic values')
- # ax.grid(True)
+ if verbose>=3: print('[pca] >Normalizing input data per feature (zero mean)..')
+ # Plot the data distribution
+ # fig,(ax1,ax2)=plt.subplots(1,2, figsize=(15,5))
+ # ax1.hist(X.ravel().astype(float), bins=50)
+ # ax1.set_ylabel('frequency')
+ # ax1.set_xlabel('Values')
+ # ax1.set_title('RAW')
+ # ax1.grid(True)
+
+ X = preprocessing.scale(X, with_mean=True, with_std=True, axis=0)
+
+ # Plot the data distribution
+ # ax2.hist(X.ravel().astype(float), bins=50)
+ # ax2.set_ylabel('frequency')
+ # ax2.set_xlabel('Values')
+ # ax2.set_title('Zero-mean with unit variance normalized')
+ # ax2.grid(True)

# Return
return(X, row_labels, col_labels)
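The updated scale call standardizes each feature column to zero mean and unit variance; a quick sketch of the effect:

import numpy as np
from sklearn import preprocessing

X = np.array([[1., 10.], [2., 20.], [3., 30.]])
Xs = preprocessing.scale(X, with_mean=True, with_std=True, axis=0)
print(Xs.mean(axis=0))  # ~[0. 0.]
print(Xs.std(axis=0))   # ~[1. 1.]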


2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
- install_requires=['matplotlib','numpy','sklearn','scipy','colourmap','pandas','tqdm'],
+ install_requires=['wget','matplotlib','numpy','sklearn','scipy','colourmap','pandas','tqdm'],
python_requires='>=3',
name='pca',
version=new_version,
