Parsing Drugs Data

This notebook explains how to parse drug data obtained from DrugBank. The DrugBank database is a unique bioinformatics and cheminformatics resource that combines detailed drug data with comprehensive drug target information. (Source: DrugBank)

import re
import itertools
import json
import sys
import os
import time
import traceback
from lxml import etree
def get_text(element, tag):
    """Return the text of the first ``tag`` child of ``element``.

    Args:
        element: Parent element exposing ``find(tag)``.
        tag: Tag name (or path) passed straight through to ``element.find``.

    Returns:
        The child's text, or ``''`` when the child is missing or carries no
        text, so callers always receive a string.
    """
    child = element.find(tag)
    if child is None:
        return ''
    # An element that exists but is empty (e.g. <tag/>) has ``text`` set to
    # None; normalise it to '' so it matches the "missing child" result.
    return child.text or ''
import xml.etree.ElementTree as ET

# Open the dump in binary mode so the XML parser decodes the bytes itself,
# using the encoding declared by the document. A text-mode handle is decoded
# with the platform default encoding before the parser sees it, which garbles
# non-ASCII text whenever that default differs from the file's real encoding.
# The with-block also guarantees the handle is closed once parsing is done.
with open("./data/fulldatabase.xml", "rb") as database:
    tree = ET.parse(database)
root = tree.getroot()
# Namespace prefix shared by every element tag looked up below.
NS = "{http://www.drugbank.ca}"

# Keys paired positionally with the children of each dosage element.
# NOTE(review): "from" is presumably a typo for "form", but it is the key
# written into the output records, so it is deliberately left unchanged.
DOSAGE_KEYS = ["from", "route", "strength"]


def _children(parent, tag):
    """Return the child elements of ``parent``'s ``tag`` sub-element.

    Yields an empty list when the sub-element is absent, so callers can
    iterate unconditionally instead of crashing on ``None``.
    """
    container = parent.find(tag)
    return [] if container is None else list(container)


k = 0       # number of records visited
Data = []   # full per-drug records
sdata = []  # name + synonyms only

# The with-block closes alldrugs.txt even if parsing fails part-way; the
# explicit encoding keeps non-ASCII names writable on every platform.
with open("alldrugs.txt", "w", encoding="utf-8") as f:
    for drug in root:
        k = k + 1

        # Look every scalar field up afresh for each drug. get_text yields ''
        # when the element is absent, so a record with a missing field no
        # longer inherits the value left over from the previous drug.
        d_name = get_text(drug, NS + "name")
        d_state = get_text(drug, NS + "state")
        d_description = get_text(drug, NS + "description")
        d_indication = get_text(drug, NS + "indication")

        # ---------dosages --------
        # Each dosage's children are matched positionally with DOSAGE_KEYS.
        D = [
            {key: item.text for item, key in zip(dosage, DOSAGE_KEYS)}
            for dosage in _children(drug, NS + "dosages")
        ]

        # -----------------Targets -----
        # Collected for inspection only; not written to any output below.
        T = [t.text for t in _children(drug, NS + "targets")]

        # ----------pathways ------
        # Collected for inspection only; not written to any output below.
        P = [t.text for t in _children(drug, NS + "pathways")]

        # ----------synonyms ----------
        S = [t.text for t in _children(drug, NS + "synonyms")]

        # One line per drug: the name followed by every synonym longer than
        # three characters, '|'-separated. Synonyms without text are skipped
        # here rather than crashing on len(None).
        line = "|".join(
            [d_name or ""] + [s for s in S if s is not None and len(s) > 3]
        )

        sdata.append({"name": d_name,
                      "synonyms": S})

        Data.append({"name": d_name,
                     "description": d_description,
                     "state": d_state,
                     "indication": d_indication,
                     "dosages": D,
                     "synonyms": S})
        f.write(line)
        f.write("\n")
import json as json

# Persist the full per-drug records.
with open("Drugs.json", "w") as f:
    json.dump(Data, f)

# Persist the name/synonym pairs. This used to dump ``Data`` a second time,
# which made syn.json a byte-for-byte duplicate of Drugs.json.
with open("syn.json", "w") as f:
    json.dump(sdata, f)
# Build a table from the name/synonym records collected in ``sdata``.
import pandas as pd
DF = pd.DataFrame(sdata)
# Bare expression: previews the first rows when run as a notebook cell.
DF.head()
name synonyms
0 Lepirudin [Hirudin variant-1, Lepirudin recombinant]
1 Cetuximab [Cetuximab, CĂ©tuximab, Cetuximabum, Immunoglob...
2 Dornase alfa [Deoxyribonuclease (human clone 18-1 protein m...
3 Denileukin diftitox [Denileukin, Interleukin-2/diptheria toxin fus...
4 Etanercept [Etanercept-szzs, RHU TNFR:FC, RHU-TNFR:FC, TN...
# Write the current DataFrame to syn.csv.
DF.to_csv("syn.csv")
# Bare expression: shows (rows, columns) when run as a notebook cell.
DF.shape
(11922, 2)
import pandas as pd
import json as json

# Read back the records previously serialised to Drugs.json.
with open("Drugs.json", 'r') as ff:
    Data = json.loads(ff.read())

# Tabulate the reloaded records; the bare expression below previews the
# first rows when run as a notebook cell.
DF = pd.DataFrame(data=Data)
DF.head()
description dosages indication name state synonyms
0 Lepirudin is identical to natural hirudin exce... [{'from': 'Injection, solution, concentrate', ... For the treatment of heparin-induced thrombocy... Lepirudin liquid [Hirudin variant-1, Lepirudin recombinant]
1 Cetuximab is an epidermal growth factor recept... [{'from': 'Injection, solution', 'route': 'Int... Cetuximab, used in combination with irinotecan... Cetuximab liquid [Cetuximab, CĂ©tuximab, Cetuximabum, Immunoglob...
2 Dornase alfa is a biosynthetic form of human d... [{'from': 'Solution', 'route': 'Respiratory (i... Used as adjunct therapy in the treatment of cy... Dornase alfa liquid [Deoxyribonuclease (human clone 18-1 protein m...
3 A recombinant DNA-derived cytotoxic protein co... [{'from': 'Injection, solution', 'route': 'Int... For treatment of cutaneous T-cell lymphoma Denileukin diftitox liquid [Denileukin, Interleukin-2/diptheria toxin fus...
4 Dimeric fusion protein consisting of the extra... [{'from': 'Injection, powder, for solution', '... Etanercept is indicated for the treatment of m... Etanercept liquid [Etanercept-szzs, RHU TNFR:FC, RHU-TNFR:FC, TN...
# Re-index the table by its "name" column, then write it to Drugsall.csv.
DF = DF.set_index("name")
DF.to_csv("Drugsall.csv")