This pipeline will parse the extracted data and convert it into data structures compatible with the CaseOLAP pipeline.
Installation of required Python packages: Install the lxml library into the current Python environment (e.g., pip install lxml) and import it together with the standard-library modules re, itertools, json, sys, os, time, and traceback.
import re
import itertools
import json
import sys
import os
import time
import traceback
from lxml import etree
DATA_DIR = './'  # root directory holding the downloaded PubMed files and the output folders
MeSH statistics dictionary: mesh_statistics maps the number of MeSH headings found in an article to the count of articles with that many headings (e.g., an entry 12: 850 would mean 850 articles carry twelve headings).
mesh_statistics = {}
Data parsing strategy: The extracted data is an XML file, and the text data is embedded in the tree structure of the XML document. The steps for parsing the data are as follows: use the lxml module to traverse the tree structure of the XML document, and obtain the PMID, authors, abstract, MeSH headings, etc. through the tags representing these components.
# Search for the tag in the XML element.
# Return the tag's text if the tag exists; return an empty string if it doesn't.
def get_text(element, tag):
    e = element.find(tag)
    if e is not None and e.text is not None:
        return e.text
    else:
        return ''
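As a quick check of the helper, here is a usage sketch on a hand-built element (the XML snippet is illustrative, not real PubMed data):
# Illustrative usage of get_text on a hand-built element
sample = etree.fromstring('<Article><PMID>12345678</PMID></Article>')
print(get_text(sample, 'PMID'))    # -> '12345678'
print(get_text(sample, 'Missing')) # -> '' (tag not present)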
# <!ELEMENT AuthorList (Author+) >
# <!ELEMENT Author (((LastName, ForeName?, Initials?, Suffix?)
#                   | CollectiveName), Identifier*, AffiliationInfo*) >
def parse_author(authors):
    result = []
    for author in authors:
        item = {}
        item['LastName'] = get_text(author, 'LastName')
        item['ForeName'] = get_text(author, 'ForeName')
        item['Initials'] = get_text(author, 'Initials')
        item['Suffix'] = get_text(author, 'Suffix')
        item['CollectiveName'] = get_text(author, 'CollectiveName')
        result.append(item)
    return result
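Likewise, parse_author can be illustrated on a hand-built AuthorList (the names are made up):
# Illustrative usage of parse_author (hypothetical author data)
authors_xml = etree.fromstring(
    '<AuthorList><Author><LastName>Doe</LastName>'
    '<ForeName>Jane</ForeName><Initials>J</Initials></Author></AuthorList>')
print(parse_author(authors_xml.findall('Author')))
# -> [{'LastName': 'Doe', 'ForeName': 'Jane', 'Initials': 'J', 'Suffix': '', 'CollectiveName': ''}]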
Creation of dictionary of parsed data: A Python dictionary is created with all the components as key-value pairs. This JSON-like data structure makes the records compatible with indexing and searching in Elasticsearch, which is described in step 3 of the protocol.
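For concreteness, here is a minimal sketch of the per-article dictionary produced by the parser below; the field values are illustrative placeholders, not real PubMed data:
# Sketch of one parsed record (hypothetical values)
example_record = {
    'PMID': '12345678',
    'Abstract': 'Example abstract text ...',
    'PubDate': {'Year': '2018'},
    'MeshHeadingList': ['Humans', 'Proteomics', 'metabolism']
}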
Creation of MeSH to PMID mapping: While the dictionary of parsed data is being created, a MeSH-to-PMID mapping table can also be built. This mapping is used to create the Text-cube document structure (in step 4), a requirement of the CaseOLAP algorithm.
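For orientation, both mapping files are written as line-delimited JSON, one object per line; with hypothetical values, the records look as follows:
# One pmid2mesh line (one object per article):
#   {"PMID": "12345678", "MeshHeadingList": ["Humans", "Proteomics"]}
# One mesh2pmid line (built at the end of this section, one object per heading):
#   {"Humans": ["12345678", "23456789"]}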
def parse_pubmed_file(file, pubmed_output_file, pmid2mesh_output_file):
    print('Start parsing %s' % file)
    sys.stdout.flush()
    t1 = time.time()
    f = open(file, 'r')
    tree = etree.parse(f)
    articles = itertools.chain(tree.findall('PubmedArticle'), tree.findall('BookDocument'))
    count = 0
    noabs = 0  # number of articles without an abstract
    for article in articles:
        count += 1
        result = {}
        pmid2mesh = {}
        # PMID - Exactly One Occurrence
        pmid = get_text(article, './/PMID')
        result['PMID'] = pmid
        pmid2mesh['PMID'] = pmid
        # # Article title - Zero or One Occurrence
        # result['ArticleTitle'] = get_text(article, './/ArticleTitle')
        # Abstract - Zero or One Occurrence
        abstractList = article.find('.//Abstract')
        if abstractList is not None:
            try:
                abstract = '\n'.join([line.text for line in abstractList.findall('AbstractText')])
                result['Abstract'] = abstract
            except:
                result['Abstract'] = ''
                noabs += 1
        else:
            result['Abstract'] = ''
            noabs += 1
        # # Author List - Zero or More Occurrences
        # authors = article.findall('.//Author')
        # result['AuthorList'] = parse_author(authors)
        # Journal - Exactly One Occurrence (BookDocument entries may lack it)
        journal = article.find('.//Journal')
        # result['Journal'] = get_text(journal, 'Title')
        result['PubDate'] = {}
        result['PubDate']['Year'] = get_text(journal, 'JournalIssue/PubDate/Year') if journal is not None else ''
        # result['PubDate']['Month'] = get_text(journal, 'JournalIssue/PubDate/Month')
        # result['PubDate']['Day'] = get_text(journal, 'JournalIssue/PubDate/Day')
        # result['PubDate']['Season'] = get_text(journal, 'JournalIssue/PubDate/Season')
        # result['PubDate']['MedlineDate'] = get_text(journal, 'JournalIssue/PubDate/MedlineDate')
        # MeshHeading - Zero or More Occurrences
        headings = article.findall('.//MeshHeading')
        result['MeshHeadingList'] = []
        pmid2mesh['MeshHeadingList'] = []
        for heading in headings:
            for descriptor_name in heading.findall('DescriptorName'):
                result['MeshHeadingList'].append(descriptor_name.text)
                pmid2mesh['MeshHeadingList'].append(descriptor_name.text)
            for qualifier_name in heading.findall('QualifierName'):
                result['MeshHeadingList'].append(qualifier_name.text)
                pmid2mesh['MeshHeadingList'].append(qualifier_name.text)
        # Update the MeSH statistics with this article's heading count
        mesh_count = len(result['MeshHeadingList'])
        if mesh_count in mesh_statistics:
            mesh_statistics[mesh_count] += 1
        else:
            mesh_statistics[mesh_count] = 1
        # Dump to pubmed json file <----------------------------
        json.dump(result, pubmed_output_file)
        pubmed_output_file.write('\n')
        # Dump to pmid2mesh json file <-------------------------
        json.dump(pmid2mesh, pmid2mesh_output_file)
        pmid2mesh_output_file.write('\n')
    print('Finish parsing %s, %d articles parsed in total. Total time: %fs'
          % (file, count, time.time() - t1))
    print('%d articles have no abstract' % (noabs))
    sys.stdout.flush()
    f.close()
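Before running the full directory loop below, the parser can be smoke-tested on a single downloaded file; the file path here is a hypothetical example following the baseline naming pattern:
# Hypothetical single-file smoke test of the parser
test_xml = './ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed18n0001.xml'  # assumed file name
with open('pubmed_sample.json', 'w') as out1, open('pmid2mesh_sample.json', 'w') as out2:
    parse_pubmed_file(test_xml, out1, out2)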
Setting up directories in the parsing loop: parse_dir scans a source directory and parses every file whose name matches the PubMed baseline/update naming pattern.
def parse_dir(source_dir, pubmed_output_file, pmid2mesh_output_file):
    if os.path.isdir(source_dir):
        for file in os.listdir(source_dir):
            if re.search(r'^pubmed18n\d\d\d\d\.xml$', file) is not None:
                try:
                    parse_pubmed_file(os.path.join(source_dir, file),
                                      pubmed_output_file, pmid2mesh_output_file)
                except:
                    print("XXXX Unexpected error happened when parsing %s XXXX" % file)
                    traceback.print_exc()
                    sys.stdout.flush()
Run the Parsing Pipeline
t1 = time.time()
# Make sure the output directories exist before opening the output files.
os.makedirs(os.path.join(DATA_DIR, 'data'), exist_ok=True)
os.makedirs(os.path.join(DATA_DIR, 'pmid2mesh'), exist_ok=True)
pubmed_output_file_path = os.path.join(DATA_DIR, 'data/pubmed.json')
pmid2mesh_output_file_path = os.path.join(DATA_DIR, 'pmid2mesh/pmid2mesh_from_parsing.json')
pubmed_output_file = open(pubmed_output_file_path, 'w')
pmid2mesh_output_file = open(pmid2mesh_output_file_path, 'w')
parse_dir(os.path.join(DATA_DIR, 'ftp.ncbi.nlm.nih.gov/pubmed/baseline'),
          pubmed_output_file, pmid2mesh_output_file)
parse_dir(os.path.join(DATA_DIR, 'ftp.ncbi.nlm.nih.gov/pubmed/updatefiles'),
          pubmed_output_file, pmid2mesh_output_file)
pubmed_output_file.close()
pmid2mesh_output_file.close()
mesh_file = open(os.path.join(DATA_DIR, 'data/mesh_statistics.json'), 'w')
json.dump(mesh_statistics, mesh_file)
mesh_file.close()
print("==== Parsing finished, results dumped to %s ====" % pubmed_output_file_path)
print("==== TOTAL TIME: %fs ====" % (time.time() - t1))
inputFilePath = "data/pubmed.json"
meshFilePath = "mesh2pmid/"
os.makedirs(meshFilePath, exist_ok=True)  # make sure the output directory exists
mesh2pmid_output_file = open(meshFilePath + 'mesh2pmid.json', "w")
mesh2pmid = dict()
with open(inputFilePath, "r") as fin:
    start = time.time()
    k = 0
    for line in fin:  # each line is a single document
        try:
            k += 1
            paperInfo = json.loads(line.strip())
            data_dict = {}
            # update PMID
            data_dict["pmid"] = paperInfo.get("PMID", "-1")
            # update MeSH heading <----------------------
            data_dict["mesh_heading"] = " ".join(paperInfo["MeshHeadingList"])
            # collect Mesh2PMID <-------------------
            if data_dict["pmid"] != "-1":
                for mesh in paperInfo["MeshHeadingList"]:
                    if mesh not in mesh2pmid:
                        mesh2pmid[mesh] = []
                    mesh2pmid[mesh].append(data_dict["pmid"])
            if k % 500000 == 0:
                print(k, 'done!')
        except:
            print("XXXX Unexpected error happened at line %d XXXX" % k)
# Dump the accumulated mapping: one JSON object per MeSH heading
for key, value in mesh2pmid.items():
    json.dump({key: value}, mesh2pmid_output_file)
    mesh2pmid_output_file.write('\n')
mesh2pmid = dict()
mesh2pmid_output_file.close()
end = time.time()
print("Finished. Total elapsed time: %s seconds" % (end - start))