Hierarchical Clustering

Hierarchical Clustering (with cutoff)

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import json

CVD= ['CVA','IHD','CM','ARR','VD','CHD']
clrs  = ['navy','green','firebrick',\
         'mediumslateblue','darkgoldenrod', 'deepskyblue']
def rearrang(olddf):
    col = ['CVA','IHD','CM','ARR','VD','CHD']
    newdf = pd.DataFrame()
    for t in col:
        newdf[t]= olddf[t]
    return newdf

with open('../../1.DATA/uniprot/protein2uniprot.json', 'r') as f:
    protein2uniprot = json.load(f)
print('all data:',len(protein2uniprot))

all data: 2869

data = pd.read_csv('../../1.DATA/score/score.csv')
data = data.set_index('Protein')
ndf = rearrang(data)
ndf.head(2)

	CVA	IHD	CM	ARR	VD	CHD
Protein
small_ubiquitin-related_modifier_1	0.041144	0.012216	0.078019	0.000000	0.000000	0.024314
metalloproteinase_inhibitor_4	0.042887	0.054740	0.095265	0.045032	0.034227	0.005072

ndf.shape

(2869, 6)

ndata = ndf.copy(deep = True)
ndf.describe()

	CVA	IHD	CM	ARR	VD	CHD
count	2869.000000	2869.000000	2869.000000	2869.000000	2869.000000	2869.000000
mean	0.040107	0.034860	0.026862	0.010698	0.007240	0.011746
std	0.060346	0.050428	0.038751	0.028798	0.024030	0.032627
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	0.022209	0.018526	0.013534	0.000000	0.000000	0.000000
75%	0.057229	0.045417	0.037451	0.007516	0.000000	0.009734
max	0.686945	0.419997	0.343774	0.305472	0.365424	0.595544

Set Cutoff

solid_cutoff = [0.15,0.15,0.15,0.15,0.15,0.15]

mdata = ndata.copy(deep =True)

dis = ['CVA','IHD','CM','ARR','VD','CHD']
idx = list(mdata.index)
data_dict = []

for item in idx:
    data = mdata.loc[item,:]
    lst =[data[0],data[1],data[2],data[3],data[4],data[5]]
    m = max(lst)
    for e,cut in zip(lst,solid_cutoff):
        if e == m:
            if e > cut:
                    data_dict.append({'protein':item,\
                     'CVA':data[0],\
                     'IHD':data[1],\
                     'CM':data[2],\
                     'ARR': data[3],\
                     'VD':data[4],\
                     'CHD':data[5]})

cdata = pd.DataFrame(data_dict)
cdata.index = cdata['protein']
cdata = cdata.drop('protein', axis =1)

cdata.shape

(313, 6)

Clustering

size=(50,50)
g = sns.clustermap(cdata.T.corr(),\
                   figsize=size,\
                   cmap = "YlGnBu",\
                   metric='seuclidean')



g.savefig('cutoff/cluster-cutoff-solid.pdf', format='pdf', dpi=400)

indx = g.dendrogram_row.reordered_ind

png

protein_cluster = []
for num in indx:
    for i,ndx in enumerate(cdata.index):
         if num == i:
                #print(i+1,ndx)
                protein_cluster.append({'id':i,"protein": ndx,\
                                        'ARR' : list(cdata.loc[ndx,:])[0],\
                                        'CHD':  list(cdata.loc[ndx,:])[1],\
                                        'CM' : list(cdata.loc[ndx,:])[2],\
                                        'CVA' : list(cdata.loc[ndx,:])[3],\
                                        'IHD' : list(cdata.loc[ndx,:])[4],\
                                        'VD' : list(cdata.loc[ndx,:])[5]})

protein_cluster_df = pd.DataFrame(protein_cluster)
protein_cluster_df = protein_cluster_df.set_index('protein')
protein_cluster_df = rearrang(protein_cluster_df)

protein_cluster_df.head(3)

	CVA	IHD	CM	ARR	VD	CHD
protein
methylenetetrahydrofolate_reductase	0.222450	0.155409	0.000390	0.005741	0.005021	0.100177
coagulation_factor_xii	0.167927	0.052323	0.000000	0.004614	0.000000	0.028020
matrix_metalloproteinase-9	0.245860	0.132204	0.072137	0.043005	0.061532	0.022252

Heatmap

protein_cluster_df.max()

CVA    0.686945
IHD    0.419997
CM     0.343774
ARR    0.305472
VD     0.365424
CHD    0.595544
dtype: float64

plt.figure(figsize = [22,22])
sns.heatmap(protein_cluster_df,\
            cmap="YlGnBu",\
            #cmap = sns.cubehelix_palette(1000),\
            #cmap = sns.cubehelix_palette(8, start=.5, rot=-.75),\
            #cmap = sns.color_palette("Blues"),\
            yticklabels=False,\
            vmin = 0.15,vmax = 0.70)
plt.savefig('cutoff/heatmap-cutoff-solid.pdf')

png

Barplot

protein_cluster_df.plot.barh(stacked=True,figsize=(50,100),color =clrs)
plt.gca().invert_yaxis()
plt.legend(fontsize =20)
plt.savefig('cutoff/barplot-cutoff-solid.pdf')

png

Final Result

U = []
index = list(protein_cluster_df.index)
for p in index:
    u = protein2uniprot[p]
    U.append(u)
protein_cluster_df['uniprot'] = U
protein_cluster_df.to_csv('cutoff/cluster-list-cutoff-solid.csv')