Populate MongoDB

This notebook demonstrates how to populate a MongoDB database with PubMed documents.

import pymongo
import json
# Connect to the MongoDB server listening on localhost at port 27017.
client = pymongo.MongoClient("mongodb://localhost:27017/")

# Handles for the "PubMed" database and its "documents" collection.
db = client.get_database("PubMed")
collection = db.get_collection("documents")

# Show how many documents the collection currently holds (estimated count).
collection.estimated_document_count()
4645413

Show database names

# List the names of the databases present on the connected server.
client.list_database_names()
['PubMed', 'admin', 'config', 'local']

To drop the database (this deletes the whole `PubMed` database), uncomment and run the line below

#client.drop_database('PubMed')

Populate the Database

# Preview the keys of the first five records in the JSON-lines dump.
from itertools import islice

k = 0  # number of records actually previewed
# Read as UTF-8 explicitly instead of relying on the platform default
# encoding (assumes the dump is UTF-8 -- TODO confirm).
with open("../caseolap/data/pubmed.json", 'r', encoding="utf-8") as file:
    # islice stops after five lines *or* at end of file. The previous
    # `while k < 5` wrapper looped forever on a file with fewer than five
    # lines, because the inner `for` never ran and k never advanced.
    for line in islice(file, 5):
        item = json.loads(line.strip())
        print(item.keys())
        k = k + 1
dict_keys(['PMID', 'ArticleTitle', 'Abstract',\
'MeshHeadingList', 'Journal', 'PubDate', 'Country'])
dict_keys(['PMID', 'ArticleTitle', 'Abstract',\
'MeshHeadingList', 'Journal', 'PubDate', 'Country'])
dict_keys(['PMID', 'ArticleTitle', 'Abstract',\
'MeshHeadingList', 'Journal', 'PubDate', 'Country'])
dict_keys(['PMID', 'ArticleTitle', 'Abstract',\
'MeshHeadingList', 'Journal', 'PubDate', 'Country'])
dict_keys(['PMID', 'ArticleTitle', 'Abstract',\
'MeshHeadingList', 'Journal', 'PubDate', 'Country'])
# Load every record of the JSON-lines dump into `collection`, one MongoDB
# document per unique PMID (the PMID is used as the document `_id`).
k = 0          # lines read from the dump
inserted = 0   # documents actually written to the collection
selected = {}  # PMIDs already inserted, used to skip repeated records
# Read as UTF-8 explicitly instead of relying on the platform default
# encoding (assumes the dump is UTF-8 -- TODO confirm).
with open("../caseolap/data/pubmed.json", 'r', encoding="utf-8") as file:
    for line in file:
        k = k + 1

        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) is not a record.
            continue

        item = json.loads(line)
        PMID = item['PMID']

        if PMID in selected:
            # Duplicate record: skip it. Previously an empty `{}` document
            # was inserted here, which MongoDB stored under an
            # auto-generated _id.
            continue

        data = {
            "_id": PMID,
            "title": item["ArticleTitle"],
            "abstract": item["Abstract"],
            "MeSH": item["MeshHeadingList"],
            "journal": item["Journal"],
            "date": item["PubDate"],
            "location": item["Country"],
        }
        collection.insert_one(data)
        selected[PMID] = True
        inserted = inserted + 1

        # Report progress every half million *inserted* documents. The old
        # check fired at k == 0 and so always over-reported by 0.5 million.
        if inserted % 500000 == 0:
            print(inserted / 1000000,
                  "million documents inserted in database successfully")

# Fetch a single document from the collection and print it to inspect
# the stored fields.
x = collection.find_one()
print(x) 
{'_id': '21103068', 'title': 'A Direct Comparison of the Anticancer
Activities of Digitoxin MeON-Neoglycosides and O-Glycosides: 
Oligosaccharide Chain Length-Dependent Induction of Caspase-9-Mediated
Apoptosis.', 'abstract': 'Digitoxin is a cardiac glycoside currently
being investigated for potential use in oncology. While a number of
structure-activity relationship studies have been conducted, an 
investigation of anticancer activity as a function of oligosaccharide
chain length has not yet been performed. We generated mono-, di-, 
and tri-O-digitoxoside derivatives of digitoxin and compared their
activity to the corresponding MeON-neoglycosides. Both classes of 
cardenolide derivatives display comparable oligosaccharide chain 
length-dependent cytotoxicity toward human cancer cell lines. 
Further investigation revealed that both classes of compounds 
induce caspase-9-mediated apoptosis in non-small cell lung cancer
cells (NCI-H460). Since O-glycosides and MeON-neoglycosides 
share a similar mode of action, the convenience of MeON-neoglycosylation 
could be exploited in future SAR work to rapidly survey large numbers 
of carbohydrates to prioritize selected O-glycoside candidates 
for traditional synthesis.', 'MeSH': [], 'journal': 
'ACS medicinal chemistry letters', 'date': {'Year': 
'2010', 'Month': 'Jul', 'Day': '12', 'Season': '',
'MedlineDate': ''}, 'location': 'United States'}