Cluster: t-SNE average linkage

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from time import time
from scipy import ndimage
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
%matplotlib inline
sns.set(font_scale=1.8)

# Read the score table, index it by the 'Protein' column, and keep a copy
# without that column (ndf) for the numeric work below.
data = pd.read_csv('../../datareader/score/score.csv')
data = data.set_index('Protein', drop=False)
ndf = data.drop(columns='Protein')
ndf.head()
ARR CHD CM CVA IHD VD
Protein
alpha-1-antitrypsin 0.009000 0.120724 0.036423 0.047402 0.103137 0.023655
mothers_against_decapentaplegic_homolog_4 0.005316 0.080604 0.056011 0.010512 0.039842 0.000000
vascular_endothelial_growth_factor_b 0.000000 0.006129 0.073377 0.028607 0.121075 0.000000
indoleamine_2,3-dioxygenase_1 0.000000 0.000000 0.000000 0.000000 0.034477 0.000000
thyroxine-binding_globulin 0.000000 0.036206 0.020516 0.006959 0.048815 0.024457

Which disease score is maximum

# For every protein, record the name of the disease column holding its
# largest score. The result (nmax) has exactly one entry per row of ndf.
mdata = ndf.copy(deep=True)
dis = ['ARR', 'CHD', 'CM', 'CVA', 'IHD', 'VD']
idx = list(mdata.index)

# The first len(dis) columns are taken by position and paired with the names
# in `dis`, as before. argmax returns a single index per row (the first one
# when several columns tie for the maximum), so a tie can no longer append
# several entries for one protein and make nmax longer than the table.
scores = np.asarray(mdata.iloc[:, :len(dis)])
nmax = [dis[col] for col in scores.argmax(axis=1)]

Cluster-Agglo: Average

# Linkage criterion passed to AgglomerativeClustering and reused in the
# output file names.
linkg = 'average'

# Plain NumPy copy of the score table, used as the t-SNE input.
tdata = ndf.copy(deep=True)
X = np.array(tdata)

Build the Manifold

# Embed the score matrix X into two dimensions with t-SNE, then min-max
# rescale each embedded axis so the coordinates fall in [0, 1].
print("Computing embedding")
embedder = manifold.TSNE(n_components=2)
Xtsne = embedder.fit_transform(X)
x_min = np.min(Xtsne, axis=0)
x_max = np.max(Xtsne, axis=0)
span = x_max - x_min
Xtsne = (Xtsne - x_min) / span
print("Done.")
Computing embedding
Done.
Xtsne
array([[0.58949834, 0.7531166 ],
       [0.60703075, 0.74036276],
       [0.29339787, 0.49895537],
       ...,
       [0.49842042, 0.57060444],
       [0.97006494, 0.53362143],
       [0.02934294, 0.732129  ]], dtype=float32)

Results

def plot_clustering(Xtsne, labels, title=None):
    """Draw every embedded point as its cluster label, coloured by cluster.

    Opens a new 15x12 matplotlib figure and writes ``str(label)`` at each
    point's coordinates. The figure is neither shown nor saved here; the
    caller does that.

    Parameters
    ----------
    Xtsne : 2-D array
        Row ``i`` holds the (x, y) position of point ``i``. The axis window
        is fixed to [-0.25, 1.25] on both axes.
    labels : sequence of int
        Cluster id of each row of ``Xtsne``, indexed by row position.
    title : str, optional
        Figure title; omitted when ``None``.

    Returns
    -------
    X, Y, Color : list, list, list
        The x coordinates, y coordinates and colour name used for each
        point, in row order.
    """
    myclc = ['mediumslateblue', 'deepskyblue', 'firebrick', 'navy', 'green',
             'darkgoldenrod']
    Color = []
    X = []
    Y = []
    plt.figure(figsize=(15, 12))
    for i, point in enumerate(Xtsne):
        label = labels[i]
        px, py = point[0], point[1]
        # Wrap around the palette so a label >= len(myclc) reuses a colour
        # instead of raising IndexError; labels 0..5 map exactly as before.
        shade = myclc[int(label) % len(myclc)]
        plt.text(px, py, str(label),
                 color=shade,
                 fontdict={'weight': 'bold', 'size': 12})
        Color.append(shade)
        X.append(px)
        Y.append(py)

    if title is not None:
        plt.title(title, size=20)
    plt.axis([-0.25, 1.25, -0.25, 1.25])

    plt.xlabel('dimension 1', fontsize=20)
    plt.ylabel('dimension 2', fontsize=20)
    plt.grid(True)
    plt.tight_layout()
    return X, Y, Color

# Module-level copy of the palette that plot_clustering also defines.
myclc = ['mediumslateblue', 'deepskyblue', 'firebrick', 'navy', 'green',
         'darkgoldenrod']

# Agglomerative clustering of the 2-D embedding into six clusters.
clustering = AgglomerativeClustering(n_clusters=6, linkage=linkg)

# Time the fit and report it as "<linkage> : <seconds>s".
t0 = time()
clustering.fit(Xtsne)
elapsed = time() - t0
print("%s : %.2fs" % (linkg, elapsed))

# NOTE: this rebinds X (previously the feature matrix) to the list of
# plotted x coordinates returned by plot_clustering.
plot_title = "%s linkage" % linkg
X, Y, Color = plot_clustering(Xtsne, clustering.labels_, plot_title)
plt.savefig('average/' + linkg + '.png')
plt.show()
average : 0.29s

png

len(clustering.labels_)
2702
# Combine the scores with the clustering output: cluster label, plot colour,
# top-scoring disease, and the embedded coordinates of each protein.
resultdf = ndf.copy(deep=True).assign(
    label=list(clustering.labels_),
    color=Color,
    nmax=nmax,
    X=X,
    Y=Y,
)
resultdf.head(5)
ARR CHD CM CVA IHD VD label color nmax X Y
Protein
alpha-1-antitrypsin 0.009000 0.120724 0.036423 0.047402 0.103137 0.023655 1 deepskyblue CHD 0.589498 0.753117
mothers_against_decapentaplegic_homolog_4 0.005316 0.080604 0.056011 0.010512 0.039842 0.000000 1 deepskyblue CHD 0.607031 0.740363
vascular_endothelial_growth_factor_b 0.000000 0.006129 0.073377 0.028607 0.121075 0.000000 4 green IHD 0.293398 0.498955
indoleamine_2,3-dioxygenase_1 0.000000 0.000000 0.000000 0.000000 0.034477 0.000000 5 darkgoldenrod IHD 0.949800 0.544562
thyroxine-binding_globulin 0.000000 0.036206 0.020516 0.006959 0.048815 0.024457 4 green IHD 0.399345 0.635551

Add UniProt

# Read the UniProt mapping, index it by the 'protein' column, and keep a
# copy without that column (udf) for lookups.
data = pd.read_csv('uniprot.csv')
data = data.set_index('protein', drop=False)
udf = data.drop(columns='protein')
udf.head()
uniprot_id gene_names has_multiple_IDs
protein
natriuretic_peptides_b P16860 NPPB NaN
angiotensin-converting_enzyme P12821 ACE DCP DCP1 NaN
potassium_voltage-gated_channel_subfamily_h_member_2 Q12809 KCNH2 ERG ERG1 HERG NaN
c-reactive_protein P02741 CRP PTX1 NaN
apolipoprotein_e P02649 APOE NaN
# Look up, for every protein in resultdf, the value in the first column of
# udf. Proteins missing from udf are reported and get the placeholder
# string 'NAN'.
idx_cvd = list(resultdf.index)
idx_uprt = list(udf.index)

# Build the name -> id mapping once, so each lookup is a dict hit instead of
# a scan of the idx_uprt list. The column is taken by position (iloc), which
# avoids the deprecated integer-position indexing on a string-labelled row.
# When a protein name is repeated in udf, only its first row is kept.
first_col = udf.iloc[:, 0]
first_col = first_col[~first_col.index.duplicated(keep='first')]
id_by_protein = first_col.to_dict()

uprot = []
for item in idx_cvd:
    if item in id_by_protein:
        uprot.append(id_by_protein[item])
    else:
        print(item, 'Match not Found')
        uprot.append('NAN')
collectrin Match not Found
aldo-keto_reductase_family_1_member_c4 Match not Found
macrophage_receptor_marco Match not Found
zinc_fingers_and_homeoboxes_protein_2 Match not Found
transcription_factor_sp3 Match not Found
synaptotagmin-9 Match not Found
e3_ubiquitin-protein_ligase_nrdp1 Match not Found
nuclear_factor_erythroid_2-related_factor_1 Match not Found
serine/threonine-protein_phosphatase_2b_catalytic_subunit_gamma_isoform Match not Found
myocyte-specific_enhancer_factor_2a Match not Found
neuropeptide_s_receptor Match not Found
fidgetin Match not Found
histone_deacetylase_6 Match not Found
transcription_factor_sox-2 Match not Found
voltage-gated_potassium_channel_subunit_beta-1 Match not Found
segment_polarity_protein_dishevelled_homolog_dvl-3 Match not Found
mitochondrial_intermediate_peptidase Match not Found
gtp-binding_protein_rit1 Match not Found
oligodendrocyte_transcription_factor_1 Match not Found
ets_homologous_factor Match not Found
troponin_t,_slow_skeletal_muscle Match not Found
dna_repair_protein_rad51_homolog_2 Match not Found
xin_actin-binding_repeat-containing_protein_1 Match not Found
3-ketoacyl-coa_thiolase,_peroxisomal Match not Found
transcription_factor_sox-17 Match not Found
cytochrome_p450_2b6 Match not Found
cyclin-dependent_kinase_inhibitor_1 Match not Found
leukotriene_b4_receptor_1 Match not Found
trans-2,3-enoyl-coa_reductase-like Match not Found
apoptosis-inducing_factor_1,_mitochondrial Match not Found
frizzled-7 Match not Found
cytohesin-2 Match not Found
kinesin_heavy_chain_isoform_5a Match not Found
phosphatidylcholine:ceramide_cholinephosphotransferase_1 Match not Found
sulfiredoxin-1 Match not Found
ubiquitin_carboxyl-terminal_hydrolase_cyld Match not Found
ermin Match not Found
wolframin Match not Found
receptor-type_tyrosine-protein_phosphatase_beta Match not Found
micos_complex_subunit_mic26 Match not Found
neuropeptide_s Match not Found
basic_helix-loop-helix_transcription_factor_scleraxis Match not Found
transmembrane_6_superfamily_member_2 Match not Found
dna_repair-scaffolding_protein Match not Found
zinc_finger_protein_260 Match not Found
retinoic_acid_receptor_rxr-gamma Match not Found
electrogenic_sodium_bicarbonate_cotransporter_1 Match not Found
zinc_finger_e-box-binding_homeobox_1 Match not Found
beta-adrenergic_receptor_kinase_2 Match not Found
adp-ribosylation_factor_6 Match not Found
uroplakin-3a Match not Found
endoplasmic_reticulum_resident_protein_44 Match not Found
angiopoietin-related_protein_1 Match not Found
tgf-beta-activated_kinase_1_and_map3k7-binding_protein_1 Match not Found
rna-binding_protein_10 Match not Found
dna-binding_protein_satb1 Match not Found
cytosine-5)-methyltransferase_1 Match not Found
laminin_subunit_gamma-1 Match not Found
anthrax_toxin_receptor_2 Match not Found
protein_kinase_c_zeta_type Match not Found
cyclin-dependent_kinase_7 Match not Found
potassium_channel_subfamily_k_member_2 Match not Found
cytosolic_phospholipase_a2_gamma Match not Found
aquaporin-3 Match not Found
hyaluronidase_ph-20 Match not Found
crk-like_protein Match not Found
protein_kinase_c_eta_type Match not Found
angiopoietin-like_protein_8 Match not Found
fibronectin_type_iii_domain-containing_protein_5 Match not Found
voltage-dependent_l-type_calcium_channel_subunit_beta-2 Match not Found
eukaryotic_elongation_factor_2_kinase Match not Found
voltage-gated_potassium_channel_subunit_beta-2 Match not Found
complement_c1q_subcomponent_subunit_a Match not Found
glutamate_receptor_ionotropic,_delta-2 Match not Found
bromodomain-containing_protein_7 Match not Found
cytosine-5)-methyltransferase_3b Match not Found
ovarian_cancer_g-protein_coupled_receptor_1 Match not Found
anoctamin-6 Match not Found
complement_c4-b Match not Found
apoptosis_regulator_bcl-2 Match not Found
succinate_receptor_1 Match not Found
aldo-keto_reductase_family_1_member_b15 Match not Found
teratocarcinoma-derived_growth_factor_1 Match not Found
s-phase_kinase-associated_protein_1 Match not Found
talin-2 Match not Found
transcription_factor_mafb Match not Found
protein_kinase_c_iota_type Match not Found
t-box_transcription_factor_tbx20 Match not Found
nuclear_receptor_coactivator_7 Match not Found
regulator_of_g-protein_signaling_12 Match not Found
g-protein_coupled_receptor_family_c_group_6_member_a Match not Found
multiple_pdz_domain_protein Match not Found
b-cell_lymphoma_3_protein Match not Found
bromodomain-containing_protein_4 Match not Found
protein_kinase_c_theta_type Match not Found
cdgsh_iron-sulfur_domain-containing_protein_1 Match not Found
caspase_recruitment_domain-containing_protein_9 Match not Found
interleukin-10_receptor_subunit_alpha Match not Found
serine/threonine-protein_kinase_d3 Match not Found
ubiquitin-like_protein_isg15 Match not Found
e3_ubiquitin-protein_ligase_pellino_homolog_1 Match not Found
protein_max Match not Found
sonic_hedgehog_protein Match not Found
matrilin-2 Match not Found
src_kinase-associated_phosphoprotein_2 Match not Found
toll-like_receptor_5 Match not Found
mixed_lineage_kinase_domain-like_protein Match not Found
cyclic_amp-dependent_transcription_factor_atf-6_alpha Match not Found
proteasome_activator_complex_subunit_2 Match not Found
atp-dependent_dna_helicase_q1 Match not Found
sphingosine_1-phosphate_receptor_3 Match not Found
protein_phosphatase_1g Match not Found
hyaluronidase-3 Match not Found
serine/threonine-protein_kinase_d2 Match not Found
dual_oxidase_2 Match not Found
# Store the looked-up identifiers as the 'uprot' column and preview the table.
resultdf.loc[:, 'uprot'] = uprot
resultdf.head(10)
ARR CHD CM CVA IHD VD label color nmax X Y uprot
Protein
alpha-1-antitrypsin 0.009000 0.120724 0.036423 0.047402 0.103137 0.023655 1 deepskyblue CHD 0.589498 0.753117 P01009
mothers_against_decapentaplegic_homolog_4 0.005316 0.080604 0.056011 0.010512 0.039842 0.000000 1 deepskyblue CHD 0.607031 0.740363 Q13485
vascular_endothelial_growth_factor_b 0.000000 0.006129 0.073377 0.028607 0.121075 0.000000 4 green IHD 0.293398 0.498955 P49765
indoleamine_2,3-dioxygenase_1 0.000000 0.000000 0.000000 0.000000 0.034477 0.000000 5 darkgoldenrod IHD 0.949800 0.544562 P14902
thyroxine-binding_globulin 0.000000 0.036206 0.020516 0.006959 0.048815 0.024457 4 green IHD 0.399345 0.635551 P05543
smoothelin 0.000000 0.010463 0.000000 0.020742 0.092131 0.013016 4 green IHD 0.262940 0.600811 P53814
short-chain_specific_acyl-coa_dehydrogenase,_mitochondrial 0.000000 0.000000 0.023999 0.000000 0.022102 0.000000 2 firebrick CM 0.658054 0.193892 P16219
protein_tyrosine_phosphatase_type_iva_1 0.030680 0.000000 0.022076 0.000000 0.000000 0.000000 1 deepskyblue ARR 0.580443 0.468921 Q93096
creatine_kinase_u-type,_mitochondrial 0.007622 0.000000 0.017123 0.000000 0.341386 0.000000 4 green IHD 0.147972 0.526543 P12532
c-c_chemokine_receptor_type_5 0.000000 0.000000 0.019980 0.050426 0.000000 0.000000 1 deepskyblue CVA 0.501680 0.598933 P51681
# Write the full result table, then one CSV per cluster label 0..5, all
# under 'average/' with the linkage name as the file-name prefix.
out_prefix = 'average/' + linkg
resultdf.to_csv(out_prefix + '.csv')

cluster_frames = [resultdf[resultdf['label'] == k] for k in range(6)]
for k, frame in enumerate(cluster_frames):
    frame.to_csv(out_prefix + str(k) + '.csv')

# Keep the per-cluster frames available under their individual names.
df0, df1, df2, df3, df4, df5 = cluster_frames
df0.shape
(519, 12)

Separate cluster plots

# One scatter panel per cluster on a 2x3 grid, each drawn in the colour
# stored for that cluster's rows; the figure is saved and then shown.
plt.figure(figsize=[22, 17])
for k, frame in enumerate((df0, df1, df2, df3, df4, df5)):
    plt.subplot(2, 3, k + 1)
    plt.scatter(frame['X'], frame['Y'], color=frame['color'])
    plt.title('Cluster-label-' + str(k))
    plt.xlabel('dimension 1', fontsize=20)
    plt.ylabel('dimension 2', fontsize=20)
plt.savefig('average/' + linkg + '-subplot.png')
plt.show()

png

Cluster in depth

# Count the non-null entries of every column within each cluster label.
resultdf.groupby('label').count()
ARR CHD CM CVA IHD VD color nmax X Y uprot
label
0 519 519 519 519 519 519 519 519 519 519 519
1 676 676 676 676 676 676 676 676 676 676 676
2 287 287 287 287 287 287 287 287 287 287 287
3 112 112 112 112 112 112 112 112 112 112 112
4 911 911 911 911 911 911 911 911 911 911 911
5 197 197 197 197 197 197 197 197 197 197 197
df0.head(10)
ARR CHD CM CVA IHD VD label color nmax X Y uprot
Protein
sh3_and_multiple_ankyrin_repeat_domains_protein_3 0.00000 0.000000 0.000000 0.039151 0.000000 0.000000 0 mediumslateblue CVA 0.654033 0.959608 Q9BYB0
endophilin-b1 0.00000 0.000000 0.000000 0.043899 0.000000 0.000000 0 mediumslateblue CVA 0.516064 0.954616 Q9Y371
a_disintegrin_and_metalloproteinase_with_thrombospondin_motifs_12 0.00000 0.000000 0.000000 0.000000 0.062291 0.000000 0 mediumslateblue IHD 0.230749 0.748474 P58397
cytokine_receptor_common_subunit_gamma 0.00000 0.000000 0.000000 0.039151 0.000000 0.000000 0 mediumslateblue CVA 0.661077 0.969435 P31785
myelin_basic_protein 0.00000 0.007666 0.002121 0.349511 0.003877 0.003344 0 mediumslateblue CVA 0.352625 0.888755 P02686
phosphate-regulating_neutral_endopeptidase 0.00000 0.000000 0.000000 0.000000 0.062291 0.000000 0 mediumslateblue IHD 0.226173 0.747243 P78562
protein_z-dependent_protease_inhibitor 0.00000 0.000000 0.000000 0.065516 0.032020 0.000000 0 mediumslateblue CVA 0.432626 0.801987 Q9UK55
ste20-like_serine/threonine-protein_kinase 0.00000 0.000000 0.000000 0.040786 0.000000 0.000000 0 mediumslateblue CVA 0.545912 0.965827 Q9H2G2
tissue-type_plasminogen_activator 0.04285 0.013719 0.009599 0.330811 0.231620 0.006301 0 mediumslateblue CVA 0.335407 0.886174 P00750
interleukin-20_receptor_subunit_alpha 0.00000 0.000000 0.000000 0.039151 0.000000 0.000000 0 mediumslateblue CVA 0.653468 0.972735 Q9UHF4
# Stacked horizontal bar chart for the first 50 rows of df0, with the
# plot coordinates and the cluster label removed beforehand.
first_fifty = df0.iloc[0:50].drop(columns=['X', 'Y', 'label'])
first_fifty.plot.barh(stacked=True, figsize=(16, 25), fontsize=15)
<matplotlib.axes._subplots.AxesSubplot at 0x7f93a8b73358>

png