Cluster: tSNE complete linkage
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from time import time
from scipy import ndimage
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
%matplotlib inline
# Load the per-protein score table and use the protein name as the row index.
data = pd.read_csv('../../datareader/score/score.csv')
data.index = data['Protein']
# ndf keeps the remaining columns only; the protein name now lives in the index.
ndf = data.drop('Protein',axis =1)
|
ARR |
CHD |
CM |
CVA |
IHD |
VD |
Protein |
|
|
|
|
|
|
alpha-1-antitrypsin |
0.009000 |
0.120724 |
0.036423 |
0.047402 |
0.103137 |
0.023655 |
mothers_against_decapentaplegic_homolog_4 |
0.005316 |
0.080604 |
0.056011 |
0.010512 |
0.039842 |
0.000000 |
vascular_endothelial_growth_factor_b |
0.000000 |
0.006129 |
0.073377 |
0.028607 |
0.121075 |
0.000000 |
indoleamine_2,3-dioxygenase_1 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.034477 |
0.000000 |
thyroxine-binding_globulin |
0.000000 |
0.036206 |
0.020516 |
0.006959 |
0.048815 |
0.024457 |
Which disease score is the maximum
# For every protein, record the disease whose score is the largest in its row.
mdata = ndf.copy(deep=True)
dis = ['ARR', 'CHD', 'CM', 'CVA', 'IHD', 'VD']
idx = list(mdata.index)
# The first len(dis) columns are read by position and paired with `dis`.
# argmax returns exactly one position per row (the first one when several
# columns tie for the maximum), so nmax always has one entry per protein.
# The previous nested loop appended every tied label, which could make nmax
# longer than the frame, and it read each row through label-based .loc plus
# positional Series[int] access, which breaks on duplicated index labels.
best_position = mdata.iloc[:, :len(dis)].values.argmax(axis=1)
nmax = [dis[pos] for pos in best_position]
Cluster-Agglomerative: Complete
# Independent working copy of the score table (DataFrame.copy is deep by default).
tdata = ndf.copy()
Build the Manifold
# Embed the rows of X in two dimensions with t-SNE, then min-max scale each
# embedded dimension to the [0, 1] range.
# NOTE(review): X is not defined in the visible cells -- presumably the score
# matrix (e.g. tdata.values); confirm against the full notebook.
print("Computing embedding")
Xtsne = manifold.TSNE(n_components=2).fit_transform(X)
x_min = Xtsne.min(axis=0)
x_max = Xtsne.max(axis=0)
Xtsne = (Xtsne - x_min) / (x_max - x_min)
print("Done.")
Computing embedding
Done.
array([[0.58949834, 0.7531166 ],
[0.60703075, 0.74036276],
[0.29339787, 0.49895537],
...,
[0.49842042, 0.57060444],
[0.97006494, 0.53362143],
[0.02934294, 0.732129 ]], dtype=float32)
Results
def plot_clustering(Xtsne, labels, title=None):
    """Draw every embedded point as its cluster number, coloured by cluster.

    A new 15x12 figure is opened and, for each row ``i`` of ``Xtsne``, the
    text ``str(labels[i])`` is written at ``(Xtsne[i, 0], Xtsne[i, 1])``.
    The figure is left open so the caller can save or show it.

    Parameters
    ----------
    Xtsne : array indexed as ``Xtsne[i, 0]`` / ``Xtsne[i, 1]``
        2-D coordinates, one row per point.
    labels : sequence of int
        Cluster label per row; each label indexes the six-colour palette
        defined in the body.
    title : str, optional
        Figure title; omitted when ``None``.

    Returns
    -------
    tuple of three lists
        x-coordinates, y-coordinates and colour names, in row order.
    """
    palette = ['mediumslateblue', 'deepskyblue', 'firebrick',
               'navy', 'green', 'darkgoldenrod']
    xs, ys, colours = [], [], []

    plt.figure(figsize=(15, 12))
    for row in range(Xtsne.shape[0]):
        px = Xtsne[row, 0]
        py = Xtsne[row, 1]
        cluster = labels[row]
        shade = palette[cluster]
        plt.text(px, py, str(cluster),
                 color=shade,
                 fontdict={'weight': 'bold', 'size': 12})
        colours.append(shade)
        xs.append(px)
        ys.append(py)

    if title is not None:
        plt.title(title, size=20)
    # Fixed limits leave a margin around coordinates scaled to [0, 1].
    plt.axis([-0.25, 1.25, -0.25, 1.25])
    plt.xlabel('dimension 1', fontsize=20)
    plt.ylabel('dimension 2', fontsize=20)
    plt.grid(True)
    plt.tight_layout()
    return xs, ys, colours
# Colour names, one per cluster label 0-5 (same list as inside plot_clustering).
myclc = ['mediumslateblue','deepskyblue','firebrick','navy','green','darkgoldenrod']
# Agglomerative clustering of the 2-D embedding into six clusters.
# NOTE(review): linkg is not defined in the visible cells -- presumably a
# linkage name such as 'complete' or 'average'; confirm against the notebook.
clustering = AgglomerativeClustering(linkage=linkg,n_clusters=6)
t0 = time()
clustering.fit(Xtsne)
# Report the linkage name and how long the fit took.
print("%s : %.2fs" % (linkg, time() - t0))
# NOTE(review): this rebinds X -- the name passed to TSNE.fit_transform when
# the embedding was built -- to a list of plotted x-coordinates, so re-running
# the embedding cell after this one would embed the wrong data.
X,Y,Color = plot_clustering(Xtsne, clustering.labels_,"%s linkage" % linkg)
# The figure is written under 'average/' regardless of the value of linkg.
plt.savefig('average/'+linkg+'.png')
plt.show()
# Collect the scores together with the clustering results, one row per protein.
resultdf = ndf.copy(deep=True)
extra_columns = [
    ('label', list(clustering.labels_)),  # cluster id assigned by the fit
    ('color', Color),                     # colour name used in the plot
    ('nmax', nmax),                       # disease with the highest score
    ('X', X),                             # plotted x-coordinate
    ('Y', Y),                             # plotted y-coordinate
]
for column_name, column_values in extra_columns:
    resultdf[column_name] = column_values
|
ARR |
CHD |
CM |
CVA |
IHD |
VD |
label |
color |
nmax |
X |
Y |
Protein |
|
|
|
|
|
|
|
|
|
|
|
alpha-1-antitrypsin |
0.009000 |
0.120724 |
0.036423 |
0.047402 |
0.103137 |
0.023655 |
1 |
deepskyblue |
CHD |
0.589498 |
0.753117 |
mothers_against_decapentaplegic_homolog_4 |
0.005316 |
0.080604 |
0.056011 |
0.010512 |
0.039842 |
0.000000 |
1 |
deepskyblue |
CHD |
0.607031 |
0.740363 |
vascular_endothelial_growth_factor_b |
0.000000 |
0.006129 |
0.073377 |
0.028607 |
0.121075 |
0.000000 |
4 |
green |
IHD |
0.293398 |
0.498955 |
indoleamine_2,3-dioxygenase_1 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.034477 |
0.000000 |
5 |
darkgoldenrod |
IHD |
0.949800 |
0.544562 |
thyroxine-binding_globulin |
0.000000 |
0.036206 |
0.020516 |
0.006959 |
0.048815 |
0.024457 |
4 |
green |
IHD |
0.399345 |
0.635551 |
Add UniProt
# Load the protein -> UniProt lookup table and index it by protein name.
# Note: this rebinds `data`, which earlier held the score table.
data = pd.read_csv('uniprot.csv')
data.index = data['protein']
udf = data.drop('protein',axis =1)
# Preview the first rows of the lookup table.
udf.head()
|
uniprot_id |
gene_names |
has_multiple_IDs |
protein |
|
|
|
natriuretic_peptides_b |
P16860 |
NPPB |
NaN |
angiotensin-converting_enzyme |
P12821 |
ACE DCP DCP1 |
NaN |
potassium_voltage-gated_channel_subfamily_h_member_2 |
Q12809 |
KCNH2 ERG ERG1 HERG |
NaN |
c-reactive_protein |
P02741 |
CRP PTX1 |
NaN |
apolipoprotein_e |
P02649 |
APOE |
NaN |
# Attach a UniProt accession to every clustered protein.
idx_cvd = list(resultdf.index)
idx_uprt = list(udf.index)
# Map protein name -> value in udf's first column, keeping the first row when
# a name is repeated. A dict gives constant-time membership tests (the old
# `item in idx_uprt` scanned a list for every protein) and replaces
# `udf.loc[item, :][0]`, which relied on deprecated positional Series indexing
# and raised when a protein name occurred more than once in udf.
first_column = udf.iloc[:, 0]
first_column = first_column[~first_column.index.duplicated(keep='first')]
uprot_by_protein = first_column.to_dict()
uprot = []
for item in idx_cvd:
    if item in uprot_by_protein:
        uprot.append(uprot_by_protein[item])
    else:
        # Report the miss and keep the 'NAN' placeholder so that uprot stays
        # aligned with the rows of resultdf.
        print(item,'Match not Found')
        uprot.append('NAN')
collectrin Match not Found
aldo-keto_reductase_family_1_member_c4 Match not Found
macrophage_receptor_marco Match not Found
zinc_fingers_and_homeoboxes_protein_2 Match not Found
transcription_factor_sp3 Match not Found
synaptotagmin-9 Match not Found
e3_ubiquitin-protein_ligase_nrdp1 Match not Found
nuclear_factor_erythroid_2-related_factor_1 Match not Found
serine/threonine-protein_phosphatase_2b_catalytic_subunit_gamma_isoform Match not Found
myocyte-specific_enhancer_factor_2a Match not Found
neuropeptide_s_receptor Match not Found
fidgetin Match not Found
histone_deacetylase_6 Match not Found
transcription_factor_sox-2 Match not Found
voltage-gated_potassium_channel_subunit_beta-1 Match not Found
segment_polarity_protein_dishevelled_homolog_dvl-3 Match not Found
mitochondrial_intermediate_peptidase Match not Found
gtp-binding_protein_rit1 Match not Found
oligodendrocyte_transcription_factor_1 Match not Found
ets_homologous_factor Match not Found
troponin_t,_slow_skeletal_muscle Match not Found
dna_repair_protein_rad51_homolog_2 Match not Found
xin_actin-binding_repeat-containing_protein_1 Match not Found
3-ketoacyl-coa_thiolase,_peroxisomal Match not Found
transcription_factor_sox-17 Match not Found
cytochrome_p450_2b6 Match not Found
cyclin-dependent_kinase_inhibitor_1 Match not Found
leukotriene_b4_receptor_1 Match not Found
trans-2,3-enoyl-coa_reductase-like Match not Found
apoptosis-inducing_factor_1,_mitochondrial Match not Found
frizzled-7 Match not Found
cytohesin-2 Match not Found
kinesin_heavy_chain_isoform_5a Match not Found
phosphatidylcholine:ceramide_cholinephosphotransferase_1 Match not Found
sulfiredoxin-1 Match not Found
ubiquitin_carboxyl-terminal_hydrolase_cyld Match not Found
ermin Match not Found
wolframin Match not Found
receptor-type_tyrosine-protein_phosphatase_beta Match not Found
micos_complex_subunit_mic26 Match not Found
neuropeptide_s Match not Found
basic_helix-loop-helix_transcription_factor_scleraxis Match not Found
transmembrane_6_superfamily_member_2 Match not Found
dna_repair-scaffolding_protein Match not Found
zinc_finger_protein_260 Match not Found
retinoic_acid_receptor_rxr-gamma Match not Found
electrogenic_sodium_bicarbonate_cotransporter_1 Match not Found
zinc_finger_e-box-binding_homeobox_1 Match not Found
beta-adrenergic_receptor_kinase_2 Match not Found
adp-ribosylation_factor_6 Match not Found
uroplakin-3a Match not Found
endoplasmic_reticulum_resident_protein_44 Match not Found
angiopoietin-related_protein_1 Match not Found
tgf-beta-activated_kinase_1_and_map3k7-binding_protein_1 Match not Found
rna-binding_protein_10 Match not Found
dna-binding_protein_satb1 Match not Found
cytosine-5)-methyltransferase_1 Match not Found
laminin_subunit_gamma-1 Match not Found
anthrax_toxin_receptor_2 Match not Found
protein_kinase_c_zeta_type Match not Found
cyclin-dependent_kinase_7 Match not Found
potassium_channel_subfamily_k_member_2 Match not Found
cytosolic_phospholipase_a2_gamma Match not Found
aquaporin-3 Match not Found
hyaluronidase_ph-20 Match not Found
crk-like_protein Match not Found
protein_kinase_c_eta_type Match not Found
angiopoietin-like_protein_8 Match not Found
fibronectin_type_iii_domain-containing_protein_5 Match not Found
voltage-dependent_l-type_calcium_channel_subunit_beta-2 Match not Found
eukaryotic_elongation_factor_2_kinase Match not Found
voltage-gated_potassium_channel_subunit_beta-2 Match not Found
complement_c1q_subcomponent_subunit_a Match not Found
glutamate_receptor_ionotropic,_delta-2 Match not Found
bromodomain-containing_protein_7 Match not Found
cytosine-5)-methyltransferase_3b Match not Found
ovarian_cancer_g-protein_coupled_receptor_1 Match not Found
anoctamin-6 Match not Found
complement_c4-b Match not Found
apoptosis_regulator_bcl-2 Match not Found
succinate_receptor_1 Match not Found
aldo-keto_reductase_family_1_member_b15 Match not Found
teratocarcinoma-derived_growth_factor_1 Match not Found
s-phase_kinase-associated_protein_1 Match not Found
talin-2 Match not Found
transcription_factor_mafb Match not Found
protein_kinase_c_iota_type Match not Found
t-box_transcription_factor_tbx20 Match not Found
nuclear_receptor_coactivator_7 Match not Found
regulator_of_g-protein_signaling_12 Match not Found
g-protein_coupled_receptor_family_c_group_6_member_a Match not Found
multiple_pdz_domain_protein Match not Found
b-cell_lymphoma_3_protein Match not Found
bromodomain-containing_protein_4 Match not Found
protein_kinase_c_theta_type Match not Found
cdgsh_iron-sulfur_domain-containing_protein_1 Match not Found
caspase_recruitment_domain-containing_protein_9 Match not Found
interleukin-10_receptor_subunit_alpha Match not Found
serine/threonine-protein_kinase_d3 Match not Found
ubiquitin-like_protein_isg15 Match not Found
e3_ubiquitin-protein_ligase_pellino_homolog_1 Match not Found
protein_max Match not Found
sonic_hedgehog_protein Match not Found
matrilin-2 Match not Found
src_kinase-associated_phosphoprotein_2 Match not Found
toll-like_receptor_5 Match not Found
mixed_lineage_kinase_domain-like_protein Match not Found
cyclic_amp-dependent_transcription_factor_atf-6_alpha Match not Found
proteasome_activator_complex_subunit_2 Match not Found
atp-dependent_dna_helicase_q1 Match not Found
sphingosine_1-phosphate_receptor_3 Match not Found
protein_phosphatase_1g Match not Found
hyaluronidase-3 Match not Found
serine/threonine-protein_kinase_d2 Match not Found
dual_oxidase_2 Match not Found
# Store the looked-up identifiers ('NAN' where no match was found) and preview
# the first ten rows.
resultdf['uprot'] = uprot
resultdf.head(10)
|
ARR |
CHD |
CM |
CVA |
IHD |
VD |
label |
color |
nmax |
X |
Y |
uprot |
Protein |
|
|
|
|
|
|
|
|
|
|
|
|
alpha-1-antitrypsin |
0.009000 |
0.120724 |
0.036423 |
0.047402 |
0.103137 |
0.023655 |
1 |
deepskyblue |
CHD |
0.589498 |
0.753117 |
P01009 |
mothers_against_decapentaplegic_homolog_4 |
0.005316 |
0.080604 |
0.056011 |
0.010512 |
0.039842 |
0.000000 |
1 |
deepskyblue |
CHD |
0.607031 |
0.740363 |
Q13485 |
vascular_endothelial_growth_factor_b |
0.000000 |
0.006129 |
0.073377 |
0.028607 |
0.121075 |
0.000000 |
4 |
green |
IHD |
0.293398 |
0.498955 |
P49765 |
indoleamine_2,3-dioxygenase_1 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.034477 |
0.000000 |
5 |
darkgoldenrod |
IHD |
0.949800 |
0.544562 |
P14902 |
thyroxine-binding_globulin |
0.000000 |
0.036206 |
0.020516 |
0.006959 |
0.048815 |
0.024457 |
4 |
green |
IHD |
0.399345 |
0.635551 |
P05543 |
smoothelin |
0.000000 |
0.010463 |
0.000000 |
0.020742 |
0.092131 |
0.013016 |
4 |
green |
IHD |
0.262940 |
0.600811 |
P53814 |
short-chain_specific_acyl-coa_dehydrogenase,_mitochondrial |
0.000000 |
0.000000 |
0.023999 |
0.000000 |
0.022102 |
0.000000 |
2 |
firebrick |
CM |
0.658054 |
0.193892 |
P16219 |
protein_tyrosine_phosphatase_type_iva_1 |
0.030680 |
0.000000 |
0.022076 |
0.000000 |
0.000000 |
0.000000 |
1 |
deepskyblue |
ARR |
0.580443 |
0.468921 |
Q93096 |
creatine_kinase_u-type,_mitochondrial |
0.007622 |
0.000000 |
0.017123 |
0.000000 |
0.341386 |
0.000000 |
4 |
green |
IHD |
0.147972 |
0.526543 |
P12532 |
c-c_chemokine_receptor_type_5 |
0.000000 |
0.000000 |
0.019980 |
0.050426 |
0.000000 |
0.000000 |
1 |
deepskyblue |
CVA |
0.501680 |
0.598933 |
P51681 |
# Write the full result table, then one CSV per cluster label (0-5).
resultdf.to_csv('average/'+linkg+ '.csv')
# df0..df5 stay bound at module level because later cells plot from them.
df0, df1, df2, df3, df4, df5 = [
    resultdf[resultdf['label'] == cluster_id] for cluster_id in range(6)
]
cluster_exports = [
    ('0.csv', df0),
    ('1.csv', df1),
    ('2.csv', df2),
    ('3.csv', df3),
    ('4.csv', df4),
    ('5.csv', df5),
]
for suffix, cluster_frame in cluster_exports:
    cluster_frame.to_csv('average/' + linkg + suffix)
Separate cluster plots
# One scatter panel per cluster on a 2x3 grid, each in its cluster colour.
plt.figure(figsize = [22,17])
panels = [
    ('Cluster-label-0', df0),
    ('Cluster-label-1', df1),
    ('Cluster-label-2', df2),
    ('Cluster-label-3', df3),
    ('Cluster-label-4', df4),
    ('Cluster-label-5', df5),
]
for position, (panel_title, frame) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    plt.scatter(frame['X'], frame['Y'], color=frame['color'])
    plt.title(panel_title)
    plt.xlabel('dimension 1', fontsize=20)
    plt.ylabel('dimension 2', fontsize=20)
plt.savefig('average/'+linkg+'-subplot.png')
plt.show()
Clusters in depth
# Count the non-null entries of every column within each cluster label.
resultdf.groupby('label').count()
|
ARR |
CHD |
CM |
CVA |
IHD |
VD |
color |
nmax |
X |
Y |
uprot |
label |
|
|
|
|
|
|
|
|
|
|
|
0 |
519 |
519 |
519 |
519 |
519 |
519 |
519 |
519 |
519 |
519 |
519 |
1 |
676 |
676 |
676 |
676 |
676 |
676 |
676 |
676 |
676 |
676 |
676 |
2 |
287 |
287 |
287 |
287 |
287 |
287 |
287 |
287 |
287 |
287 |
287 |
3 |
112 |
112 |
112 |
112 |
112 |
112 |
112 |
112 |
112 |
112 |
112 |
4 |
911 |
911 |
911 |
911 |
911 |
911 |
911 |
911 |
911 |
911 |
911 |
5 |
197 |
197 |
197 |
197 |
197 |
197 |
197 |
197 |
197 |
197 |
197 |
|
ARR |
CHD |
CM |
CVA |
IHD |
VD |
label |
color |
nmax |
X |
Y |
uprot |
Protein |
|
|
|
|
|
|
|
|
|
|
|
|
sh3_and_multiple_ankyrin_repeat_domains_protein_3 |
0.00000 |
0.000000 |
0.000000 |
0.039151 |
0.000000 |
0.000000 |
0 |
mediumslateblue |
CVA |
0.654033 |
0.959608 |
Q9BYB0 |
endophilin-b1 |
0.00000 |
0.000000 |
0.000000 |
0.043899 |
0.000000 |
0.000000 |
0 |
mediumslateblue |
CVA |
0.516064 |
0.954616 |
Q9Y371 |
a_disintegrin_and_metalloproteinase_with_thrombospondin_motifs_12 |
0.00000 |
0.000000 |
0.000000 |
0.000000 |
0.062291 |
0.000000 |
0 |
mediumslateblue |
IHD |
0.230749 |
0.748474 |
P58397 |
cytokine_receptor_common_subunit_gamma |
0.00000 |
0.000000 |
0.000000 |
0.039151 |
0.000000 |
0.000000 |
0 |
mediumslateblue |
CVA |
0.661077 |
0.969435 |
P31785 |
myelin_basic_protein |
0.00000 |
0.007666 |
0.002121 |
0.349511 |
0.003877 |
0.003344 |
0 |
mediumslateblue |
CVA |
0.352625 |
0.888755 |
P02686 |
phosphate-regulating_neutral_endopeptidase |
0.00000 |
0.000000 |
0.000000 |
0.000000 |
0.062291 |
0.000000 |
0 |
mediumslateblue |
IHD |
0.226173 |
0.747243 |
P78562 |
protein_z-dependent_protease_inhibitor |
0.00000 |
0.000000 |
0.000000 |
0.065516 |
0.032020 |
0.000000 |
0 |
mediumslateblue |
CVA |
0.432626 |
0.801987 |
Q9UK55 |
ste20-like_serine/threonine-protein_kinase |
0.00000 |
0.000000 |
0.000000 |
0.040786 |
0.000000 |
0.000000 |
0 |
mediumslateblue |
CVA |
0.545912 |
0.965827 |
Q9H2G2 |
tissue-type_plasminogen_activator |
0.04285 |
0.013719 |
0.009599 |
0.330811 |
0.231620 |
0.006301 |
0 |
mediumslateblue |
CVA |
0.335407 |
0.886174 |
P00750 |
interleukin-20_receptor_subunit_alpha |
0.00000 |
0.000000 |
0.000000 |
0.039151 |
0.000000 |
0.000000 |
0 |
mediumslateblue |
CVA |
0.653468 |
0.972735 |
Q9UHF4 |
# Stacked horizontal bars for the first 50 proteins of cluster 0, with the
# plot coordinates and the cluster label removed before plotting.
(
    df0.iloc[:50]
    .drop(columns=['X', 'Y', 'label'])
    .plot.barh(stacked=True, figsize=(16, 25), fontsize=15)
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f93a8b73358>