Principal Component Analysis

PCA : CVD

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.decomposition import PCA
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy.cluster.hierarchy import dendrogram, linkage

# Enlarge seaborn's font scaling for every plot drawn after this point.
sns.set(font_scale=1.8)

Data

# Load the score table and use the 'Protein' column as the row index.
data = pd.read_csv('../datareader/score/score.csv')
data = data.set_index('Protein')
def rearrang(olddf):
    """Return a copy of ``olddf`` restricted to the six disease columns.

    The columns are emitted in the fixed order CVA, IHD, CM, ARR, VD, CHD,
    regardless of their order in ``olddf``; every other column is dropped.
    The row index (including its name) is carried over unchanged.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Frame containing at least the six disease columns.

    Returns
    -------
    pandas.DataFrame
        A new frame whose data is independent of ``olddf``.

    Raises
    ------
    KeyError
        If any of the six columns is missing from ``olddf``.
    """
    col = ['CVA', 'IHD', 'CM', 'ARR', 'VD', 'CHD']
    # One list-based selection reorders the columns in a single step instead
    # of growing an empty frame column by column; ``copy`` keeps the result
    # detached from the caller's frame.
    return olddf[col].copy()
# Keep only the six disease columns (in a fixed order) and preview the result.
ndf = rearrang(data)
ndf.head()

	CVA	IHD	CM	ARR	VD	CHD
Protein
small_ubiquitin-related_modifier_1	0.041144	0.012216	0.078019	0.000000	0.000000	0.024314
metalloproteinase_inhibitor_4	0.042887	0.054740	0.095265	0.045032	0.034227	0.005072
aromatic-l-amino-acid_decarboxylase	0.055959	0.010260	0.011459	0.070661	0.000000	0.007809
nadph_oxidase_activator_1	0.035732	0.000000	0.000000	0.000000	0.000000	0.000000
tumor_necrosis_factor_ligand_superfamily_member_14	0.035732	0.000000	0.000000	0.000000	0.000000	0.000000

def feature_norm(df):
    """Min-max scale the six disease columns of ``df`` to the range [0, 1].

    Each of the columns CVA, IHD, CM, ARR, VD and CHD is rescaled
    independently as ``(x - min) / (max - min)``. Any other column is copied
    through unchanged, and ``df`` itself is never modified.

    A column whose values are all equal has a zero range; it is mapped to
    0.0 everywhere rather than producing NaN from a 0/0 division.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the six disease columns.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``df`` with the six disease columns rescaled.

    Raises
    ------
    KeyError
        If any of the six columns is missing from ``df``.
    """
    fchr = ['CVA', 'IHD', 'CM', 'ARR', 'VD', 'CHD']
    dff = df.copy(deep=True)

    values = df[fchr]
    lowest = values.min()
    span = values.max() - lowest
    # A zero range means the column is constant; dividing by 1 instead turns
    # the would-be 0/0 (NaN) into a plain 0.
    span = span.where(span != 0, 1)

    dff[fchr] = (values - lowest) / span
    return dff
# Min-max normalise each disease column of ndf and preview the result.
ndfn = feature_norm(ndf)
ndfn.head()

	CVA	IHD	CM	ARR	VD	CHD
Protein
small_ubiquitin-related_modifier_1	0.059894	0.029087	0.226948	0.000000	0.000000	0.040827
metalloproteinase_inhibitor_4	0.062432	0.130334	0.277115	0.147418	0.093665	0.008517
aromatic-l-amino-acid_decarboxylase	0.081461	0.024430	0.033333	0.231319	0.000000	0.013113
nadph_oxidase_activator_1	0.052016	0.000000	0.000000	0.000000	0.000000	0.000000
tumor_necrosis_factor_ligand_superfamily_member_14	0.052016	0.000000	0.000000	0.000000	0.000000	0.000000

Biplot: CVD

# Work on a private copy of the raw (un-normalised) scores.
cvddata = ndf.copy(deep =True)
# Fit a two-component PCA on the transposed table: after the transpose each
# disease column becomes one row (sample) and each protein one feature.
pca = PCA(n_components =2)
pca.fit(cvddata.T)


# Project the same transposed table onto the two fitted components.
reduced_data = pca.transform(cvddata.T)


# Wrap the projection in a DataFrame (integer column labels 0 and 1, which
# biplot() relies on); rows follow the column order of cvddata.
reduced_data = pd.DataFrame(reduced_data)

# Display the projected coordinates.
reduced_data

	0	1
0	2.729121	-1.000078
1	0.659602	2.072654
2	-0.460644	0.529615
3	-0.871578	-0.387538
4	-0.961345	-0.462087
5	-1.095155	-0.752566

def biplot(reduced_data, pca, fname, *, scale=50, label_offset=(6.0, 2.0),
           arrow_scale=700):
    """Draw a PCA biplot for the six disease classes and save it to ``fname``.

    The rows of ``reduced_data`` are drawn as large coloured discs labelled,
    in row order, CVA, IHD, CM, ARR, VD, CHD. One red arrow is drawn from the
    origin for every row of ``pca.components_.T``, using its first two
    entries as the arrow direction.

    Parameters
    ----------
    reduced_data : pandas.DataFrame
        Projected points; columns ``0`` and ``1`` hold the two coordinates.
    pca : object
        Fitted decomposition exposing a ``components_`` array with at least
        two rows.
    fname : str or path-like
        Destination passed to ``Figure.savefig``.
    scale : float, keyword-only
        Factor applied to both point coordinates before plotting.
    label_offset : tuple of float, keyword-only
        ``(dx, dy)`` subtracted from each scaled point to place its label.
    arrow_scale : float, keyword-only
        Factor applied to the component loadings so the arrows are visible.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the biplot was drawn on.
    """
    names = ['CVA', 'IHD', 'CM', 'ARR', 'VD', 'CHD']
    clrs = ['navy', 'green', 'firebrick', 'mediumslateblue', 'darkgoldenrod',
            'deepskyblue']

    fig, ax = plt.subplots(figsize=(10, 10))

    # Scale once and reuse for both the markers and their labels.
    xs = reduced_data.loc[:, 0] * scale
    ys = reduced_data.loc[:, 1] * scale

    # scatterplot of the reduced CVDs
    ax.scatter(x=xs, y=ys,
               facecolors=clrs,
               edgecolors='b',
               s=2000,
               alpha=0.5)

    # Shift each label down and to the left so the text sits over its disc.
    dx, dy = label_offset
    for name, x, y in zip(names, xs, ys):
        ax.annotate(name, xy=(x - dx, y - dy), xytext=(x - dx, y - dy),
                    fontsize=15)

    # One arrow per row of the transposed component matrix; the loadings are
    # scaled up to make the arrows easier to see.
    for v in pca.components_.T:
        ax.arrow(0, 0, v[0] * arrow_scale, v[1] * arrow_scale,
                 head_width=0.01,
                 head_length=0.02,
                 linewidth=0.1,
                 color='red')

    # Use the explicit Axes/Figure objects rather than pyplot's "current"
    # figure so the function cannot draw on or save an unrelated figure.
    ax.axis([-90, 150, -90, 120])
    ax.set_xlabel("Dimension 1", fontsize=20)
    ax.set_ylabel("Dimension 2", fontsize=20)
    ax.set_title("PC plane with original feature projections.", fontsize=16)
    ax.axhline(y=0, color='k')
    ax.axvline(x=0, color='k')
    fig.savefig(fname)
    return ax

# Draw the biplot for the PCA of the raw scores and save it as a PDF.
biplot(reduced_data, pca, fname = 'CVD-biplot.pdf')

<matplotlib.axes._subplots.AxesSubplot at 0x7f10760c59b0>

png

Normalized data PCA

# Work on a private copy of the min-max normalised scores.
cvddata = ndfn.copy(deep =True)
# Fit a two-component PCA on the transposed table: after the transpose each
# disease column becomes one row (sample) and each protein one feature.
pca = PCA(n_components =2)
pca.fit(cvddata.T)


# Project the same transposed table onto the two fitted components.
reduced_data = pca.transform(cvddata.T)


# Wrap the projection in a DataFrame (integer column labels 0 and 1, which
# nbiplot() relies on); rows follow the column order of cvddata.
reduced_data = pd.DataFrame(reduced_data)

def nbiplot(reduced_data, pca, fname, *, scale=20, label_offset=(3.0, 1.0),
            arrow_scale=700):
    """Draw the biplot for the normalised-data PCA and save it to ``fname``.

    The rows of ``reduced_data`` are drawn as large coloured discs labelled,
    in row order, CVA, IHD, CM, ARR, VD, CHD. One red arrow is drawn from the
    origin for every row of ``pca.components_.T``, using its first two
    entries as the arrow direction. Compared with ``biplot`` the default
    point scale and label offset are smaller (20 and (3.0, 1.0)).

    Parameters
    ----------
    reduced_data : pandas.DataFrame
        Projected points; columns ``0`` and ``1`` hold the two coordinates.
    pca : object
        Fitted decomposition exposing a ``components_`` array with at least
        two rows.
    fname : str or path-like
        Destination passed to ``Figure.savefig``.
    scale : float, keyword-only
        Factor applied to both point coordinates before plotting.
    label_offset : tuple of float, keyword-only
        ``(dx, dy)`` subtracted from each scaled point to place its label.
    arrow_scale : float, keyword-only
        Factor applied to the component loadings so the arrows are visible.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the biplot was drawn on.
    """
    names = ['CVA', 'IHD', 'CM', 'ARR', 'VD', 'CHD']
    clrs = ['navy', 'green', 'firebrick', 'mediumslateblue', 'darkgoldenrod',
            'deepskyblue']

    fig, ax = plt.subplots(figsize=(10, 10))

    # Scale once and reuse for both the markers and their labels.
    xs = reduced_data.loc[:, 0] * scale
    ys = reduced_data.loc[:, 1] * scale

    # scatterplot of the reduced CVDs
    ax.scatter(x=xs, y=ys,
               facecolors=clrs,
               edgecolors='b',
               s=2000,
               alpha=0.5)

    # Shift each label down and to the left so the text sits over its disc.
    dx, dy = label_offset
    for name, x, y in zip(names, xs, ys):
        ax.annotate(name, xy=(x - dx, y - dy), xytext=(x - dx, y - dy),
                    fontsize=15)

    # One arrow per row of the transposed component matrix; the loadings are
    # scaled up to make the arrows easier to see.
    for v in pca.components_.T:
        ax.arrow(0, 0, v[0] * arrow_scale, v[1] * arrow_scale,
                 head_width=0.01,
                 head_length=0.02,
                 linewidth=0.1,
                 color='red')

    # Use the explicit Axes/Figure objects rather than pyplot's "current"
    # figure so the function cannot draw on or save an unrelated figure.
    ax.axis([-90, 150, -90, 120])
    ax.set_xlabel("Dimension 1", fontsize=20)
    ax.set_ylabel("Dimension 2", fontsize=20)
    ax.set_title("PC plane with original feature projections.", fontsize=16)
    ax.axhline(y=0, color='k')
    ax.axvline(x=0, color='k')
    fig.savefig(fname)
    return ax

# Draw the biplot for the PCA of the normalised scores and save it as a PDF.
nbiplot(reduced_data, pca, fname = 'CVD-nbiplot.pdf')

<matplotlib.axes._subplots.AxesSubplot at 0x7f107737d518>

png