Clustering: Affinity Propagation

from sklearn.cluster import AffinityPropagation
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from itertools import cycle

Get Data

data = pd.read_csv('../datareader/score/old-score.csv')
data.index = data['phrase']
ndf = data.drop('phrase',axis =1)
ndf.head()
Unnamed: 0 CVA IHD CM ARR VD CHD
phrase
1,25-dihydroxyvitamin_d(3)_24-hydroxylase,_mitochondrial 0 0.000000 0.446767 0.000000 0.000000 0.0 0.000000
1,4-alpha-glucan-branching_enzyme 1 0.290414 0.000000 1.035505 0.091431 0.0 0.372223
1,5-anhydro-d-fructose_reductase 2 0.000000 0.000000 0.000000 0.000000 0.0 2.220267
1-phosphatidylinositol_3-phosphate_5-kinase 3 0.000000 0.000000 0.000000 0.308680 0.0 0.317226
1-phosphatidylinositol_4,5-bisphosphate_phosphodiesterase_beta-1 4 0.000000 0.000000 0.472116 0.000000 0.0 0.000000
ndf.shape
(3175, 7)

Split data

tdata = ndf.copy(deep =True)
cva_ihd = tdata.drop([ 'CM', 'ARR', 'VD', 'CHD'],axis =1)
cva_cm = tdata.drop([ 'IHD', 'ARR', 'VD', 'CHD'],axis =1)
cva_arr = tdata.drop([ 'IHD', 'CM', 'VD', 'CHD'],axis =1)
cva_vd = tdata.drop([ 'IHD', 'CM', 'ARR', 'CHD'],axis =1)
cva_chd = tdata.drop(['IHD', 'CM', 'ARR', 'VD'],axis =1)
ihd_cm = tdata.drop(['CVA', 'ARR', 'VD', 'CHD'],axis =1)
ihd_arr = tdata.drop(['CVA', 'CM', 'VD', 'CHD'],axis =1)
ihd_vd = tdata.drop(['CVA', 'CM', 'ARR', 'CHD'],axis =1)
ihd_chd = tdata.drop(['CVA', 'CM', 'ARR', 'VD'],axis =1)
cm_arr = tdata.drop(['CVA', 'IHD', 'VD', 'CHD'],axis =1)
cm_vd = tdata.drop(['CVA', 'IHD', 'ARR', 'CHD'],axis =1)
cm_chd = tdata.drop(['CVA', 'IHD', 'ARR', 'VD'],axis =1)
arr_vd = tdata.drop(['CVA', 'IHD', 'CM', 'CHD'],axis =1)
arr_chd = tdata.drop(['CVA', 'IHD', 'CM', 'VD'],axis =1)
vd_chd = tdata.drop(['CVA', 'IHD', 'CM', 'ARR'],axis =1)
def remover(df,cvd1,cvd2):

    protein = df.index
    d1 = df[cvd1]
    d2 = df[cvd2]
    DT = []
    c = 0.5
    cutoff = list(df.mean()+(c*df.std()))
    for p,a,b in zip(protein,d1,d2):
        if a > cutoff[0] or  b > cutoff[1] :
            DT.append({'protein': p, cvd1: a, cvd2: b},)
    dfr = pd.DataFrame(DT)
    dfr.index = dfr['protein']
    dfr = dfr.drop('protein',axis =1)
    return dfr

Select Pair

cvd1 = 'ARR' #
cvd2 = 'CVA'
df = cva_arr
pref=-1.0
cvd1 = 'IHD' #
cvd2 = 'CVA'
df = cva_ihd
pref=-1.0
cvd1 = 'CM' #
cvd2 = 'CVA'
df = cva_cm
pref=-0.1
cvd1 = 'VD'  #
cvd2 = 'CVA'
df = cva_vd
pref=-0.1
cvd1 = 'CHD' #
cvd2 = 'CVA'
df = cva_chd
pref=-0.1
cvd1 = 'IHD'#
cvd2 = 'CM'
df = ihd_cm
pref=-0.1
cvd1 = 'IHD'#
cvd2 = 'ARR'
df = ihd_arr
pref=-0.1
cvd1 = 'IHD'#
cvd2 = 'VD'
df = ihd_vd
pref=-0.1
cvd1 = 'IHD' #
cvd2 = 'CHD'
df = ihd_chd
pref=-0.1
cvd1 = 'CM' #
cvd2 = 'ARR'
df = cm_arr
pref=-0.1
cvd1 = 'CM' #
cvd2 = 'VD'
df = cm_vd
pref=-0.1
cvd1 = 'CM'#
cvd2 = 'CHD'
df = cm_chd
pref=-0.1
cvd1 = 'ARR'#
cvd2 = 'VD'
df = arr_vd
pref=-0.1
cvd1 = 'ARR'#
cvd2 = 'CHD'
df = arr_chd
pref=-0.1
cvd1 = 'VD'#
cvd2 = 'CHD'
df = vd_chd
pref=-0.1

rdf = remover(df,cvd1,cvd2)
rdf.head()
CVA IHD
protein
1-phosphatidylinositol_4,5-bisphosphate_phosphodiesterase_delta-1 0.484608 0.000000
2',3'-cyclic-nucleotide_3'-phosphodiesterase 2.739150 0.000000
3-hydroxy-3-methylglutaryl-coenzyme_a_reductase 1.006448 3.095639
43_kda_receptor-associated_protein_of_the_synapse 1.283404 0.169499
5-hydroxytryptamine_receptor_1a 0.846001 0.484862
rdf.shape
(628, 2)
X = rdf.copy(deep =True)
p = np.array(list(X.index))

Affinity Propagation

# Compute Affinity Propagation
af = AffinityPropagation(preference = pref).fit(X)
indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters = len(indices)
n_clusters
23

Data and Plot

sns.set()
plt.figure(figsize = [15,15])
DTT =[]
X = np.array(X)
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters), colors):
    class_members = labels == k
    cluster_center = X[indices[k]]
    ln = len(X[class_members, 0])
    if ln >1:
        for i in range(ln):
            DTT.append({'label': k,\
                        'protein': p[class_members][i],\
                        cvd1: X[class_members, 0][i],\
                        cvd2: X[class_members, 1][i],\
                        'color':col})
    else:

        DTT.append({'label': k,\
                        'protein': p[class_members][0],\
                        cvd1: X[class_members, 0][0],\
                        cvd2: X[class_members, 1][0],\
                        'color':col})

    # plot data
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    # plot center
    plt.plot(cluster_center[0], cluster_center[1], '.', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # draw lines
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Estimated number of clusters: %d' % n_clusters)
plt.xlabel(cvd1, fontsize = 25)
plt.ylabel(cvd2, fontsize = 25 )
plt.savefig('plot/'+cvd1+'-'+cvd2+'.pdf')
plt.show()

png


listdf = pd.DataFrame(DTT)
listdf.index = listdf['protein']
listdf = listdf.drop('protein',axis =1)
listdf.head(5)
ARR CVA color label
protein
1-phosphatidylinositol_4,5-bisphosphate_phosphodiesterase_delta-1 0.000000 0.484608 b 0
5-hydroxytryptamine_receptor_2a 0.174764 0.555104 b 0
78_kda_glucose-regulated_protein 0.065296 0.529532 b 0
achaete-scute_homolog_1 0.000000 0.449759 b 0
adenylyl_cyclase-associated_protein_2 0.000000 0.449759 b 0

Add Uniprot

data = pd.read_csv('uniprot.csv')
data.index = data['protein']
udf = data.drop('protein',axis =1)
udf.head()
uniprot_id gene_names has_multiple_IDs
protein
natriuretic_peptides_b P16860 NPPB NaN
angiotensin-converting_enzyme P12821 ACE DCP DCP1 NaN
potassium_voltage-gated_channel_subfamily_h_member_2 Q12809 KCNH2 ERG ERG1 HERG NaN
c-reactive_protein P02741 CRP PTX1 NaN
apolipoprotein_e P02649 APOE NaN
idx_cvd = list(listdf.index)
idx_uprt =  list(udf.index)

prot_dict = []
for item in idx_cvd:
    data = listdf.loc[item,:]
    if item in idx_uprt:
         prot_dict.append({'Protein':item,\
                     cvd1:data[0],\
                     cvd2:data[1],\
                     'color':data[2],\
                     'label': data[3],\
                     'uprot':udf.loc[item,:][0]})

    else:
        print(item,'Match not Found')
        prot_dict.append({'Protein':item,\
                     cvd1:data[0],\
                     cvd2:data[1],\
                     'color':data[2],\
                     'label': data[3],\
                     'uprot':'NAN'})
resultdf = pd.DataFrame(prot_dict)
resultdf.index = resultdf['Protein']
resultdf = resultdf.drop('Protein',axis =1)
resultdf.head(10)
CVA IHD color label uprot
Protein
brain-derived_neurotrophic_factor 0.492909 0.026065 b 0 P23560
alpha-galactosidase_a 0.488128 0.003105 b 0 P06280
neurotrophin-3 0.510094 0.025944 b 0 P20783
tetraspanin-33 0.686945 0.003340 b 0 Q86UF1
glutamate_receptor_ionotropic,_nmda_2c 0.479266 0.051119 b 0 Q14957
nestin 0.404329 0.040345 b 0 P48681
glutamate_receptor_ionotropic,_nmda_2b 0.480174 0.050999 b 0 Q13224
glutamate_receptor_ionotropic,_nmda_1 0.479685 0.051063 b 0 Q05586
microtubule-associated_protein_2 0.520331 0.000000 b 0 P11137
glial_fibrillary_acidic_protein 0.568521 0.003560 b 0 P14136
resultdf.to_csv('data/' + cvd1 + '-' + cvd2 + '.csv')