Source code for pubmed_parser.pubmed_web_parser

import sys
import re
import time
import requests
from lxml import etree
from lxml import html
from unidecode import unidecode

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from .utils import stringify_children

__all__ = ["parse_xml_web", "parse_citation_web", "parse_outgoing_citation_web"]


def load_xml(pmid, sleep=None):
    """
    Fetch the eutils efetch record of a PMID and parse it into an element tree

    Parameters
    ----------
    pmid: (int, str)
        PubMed ID to request

    sleep: int
        number of seconds to pause once the response has been parsed,
        useful for throttling consecutive requests
        default: None (no pause)

    Return
    ------
    tree: Element
        The response body parsed with ``lxml.html.fromstring``
    """
    url_template = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id={}"
    response = requests.get(url_template.format(pmid))
    parsed = html.fromstring(response.content)
    if sleep is None:
        return parsed
    # The pause happens after the request so the caller can issue the next one safely
    time.sleep(sleep)
    return parsed


def _first_element_text(elements):
    """Return the text of the first element of ``elements``, or None if the list is empty."""
    return elements[0].text if elements else None


def _child_text(element, tag):
    """Return the text of the first direct ``tag`` child of ``element``, or "" if missing/empty."""
    child = element.find(tag)
    if child is None:
        return ""
    return child.text or ""


def parse_pubmed_web_tree(tree):
    """
    Giving a tree Element from eutils, return parsed dictionary from the tree

    Parameters
    ----------
    tree: Element
        An lxml Element parsed from eutil website

    Return
    ------
    dict_out: dict
        A parsed output in dictionary format, dictionary keys includes
        'title', 'abstract', 'journal',
        'affiliation' (string of affiliations, '; ' separated),
        'authors' (string of author names, '; ' separated; the collective
        name is used when an author has neither forename nor lastname),
        'keywords' (MeSH terms formatted as 'MeSH descriptor UI:MeSH name' and
        ';' separated; falls back to the keyword list when no MeSH term exists),
        'doi', 'pii', 'language' (None when the element is absent),
        'year' (empty string when no "medline" publication date is found)
    """
    # Article title takes precedence; book records carry a book title instead.
    title_nodes = tree.xpath("//articletitle") or tree.xpath("//booktitle")
    title = " ".join([t.text or "" for t in title_nodes])

    abstract_tree = tree.xpath("//abstract/abstracttext")
    abstract = " ".join([stringify_children(a).strip() for a in abstract_tree])

    journal = ";".join(
        [(t.text or "").strip() for t in tree.xpath("//article//title")]
    )

    # Article history is looked up first, then book history.
    year = ""
    for history_query in (
        '//pubmeddata//history//pubmedpubdate[@pubstatus="medline"]',
        '//pubmedbookdata//history//pubmedpubdate[@pubstatus="medline"]',
    ):
        pubdates = tree.xpath(history_query)
        if len(pubdates) >= 1 and pubdates[0].find("year") is not None:
            year = pubdates[0].find("year").text
            break

    # An element without text yields an empty entry rather than breaking the join.
    affiliations = [
        affil.text or "" for affil in tree.xpath("//affiliationinfo/affiliation")
    ]
    affiliations_text = "; ".join(affiliations)

    authors = list()
    for author in tree.xpath("//authorlist/author"):
        # forename and lastname are looked up independently: either may be missing
        fullname = (
            _child_text(author, "forename") + " " + _child_text(author, "lastname")
        ).strip()
        if fullname == "":
            fullname = _child_text(author, "collectivename")
        authors.append(fullname)
    authors_text = "; ".join(authors)

    keywords_mesh = tree.xpath("//meshheadinglist//meshheading")
    keywords_book = tree.xpath("//keywordlist//keyword")
    if len(keywords_mesh) > 0:
        mesh_terms_list = []
        for heading in keywords_mesh:
            descriptor = heading.find("descriptorname")
            if descriptor is None:
                # a heading without a descriptor carries nothing to report
                continue
            mesh_terms_list.append(
                descriptor.attrib.get("ui", "") + ":" + (descriptor.text or "")
            )
        keywords = ";".join(mesh_terms_list)
    elif len(keywords_book) > 0:
        keywords = ";".join([k.text or "" for k in keywords_book])
    else:
        keywords = ""

    doi = _first_element_text(tree.xpath('//elocationid[@eidtype="doi"]'))
    pii = _first_element_text(tree.xpath('//elocationid[@eidtype="pii"]'))
    language = _first_element_text(tree.xpath("//language"))

    dict_out = {
        "title": title,
        "abstract": abstract,
        "journal": journal,
        "affiliation": affiliations_text,
        "authors": authors_text,
        "keywords": keywords,
        "doi": doi,
        "pii": pii,
        "year": year,
        "language": language,
    }
    return dict_out


def parse_xml_web(pmid, sleep=None, save_xml=False):
    """
    Give an input PMID, load and parse XML using PubMed eutils

    Parameters
    ----------
    pmid: str
        A string of PMID which you want to parse from eutils

    sleep: int
        An integer of how long you want to wait after parsing one PMID from eutils
        default: None

    save_xml: bool
        if it is True, save the serialized XML (output of ``etree.tostring``)
        in the key ``xml`` of the output dictionary, which is handy for
        checking the raw record.
        if it is False, the full XML is not added to the output
        default: False

    Return
    ------
    dict_out: dict
        A dictionary contains information of parsed XML from a given PMID,
        with the PMID itself stored as a string in the key ``pmid``

    Examples
    --------
    >>> pubmed_parser.parse_xml_web(11360989, sleep=1, save_xml=False)
    {
        'title': 'Molecular biology and evolution. Can genes explain biological complexity?',
        'abstract': '',
        'journal': 'Science (New York, N.Y.)',
        'affiliation': 'Collegium Budapest (Institute for Advanced Study), 2 Szentháromság u., H-1014 Budapest, Hungary. szathmary@colbud.hu',
        'authors': 'E Szathmáry; F Jordán; C Pál',
        'keywords': 'D000818:Animals;D005075:Biological Evolution;...',
        'doi': '10.1126/science.1060852',
        'year': '2001',
        'pmid': '11360989'
    }
    """
    tree = load_xml(pmid, sleep=sleep)
    dict_out = parse_pubmed_web_tree(tree)
    dict_out["pmid"] = str(pmid)
    if save_xml:
        dict_out["xml"] = etree.tostring(tree)
    return dict_out
def extract_citations(tree):
    """
    Extract number of citations from a given "cited by" page tree.

    Parameters
    ----------
    tree: Element
        An lxml Element parsed from the citedby page

    Return
    ------
    n_citations: int
        Number of citations that an article get until parsed date.
        If the heading is missing or holds no number, return 0
    """
    headings = tree.xpath('//form/h2[@class="head"]/text()')
    if not headings:
        # no "Is Cited by the Following ..." heading on the page
        return 0
    count_token = re.sub("Is Cited by the Following ", "", headings[0]).split(" ")[0]
    try:
        return int(count_token)
    except ValueError:
        return 0


def extract_pmc(citation):
    """
    Extract the PMC number from a citation link.

    Parameters
    ----------
    citation: str
        A '/'-separated link whose last non-empty segment is the PMC ID,
        e.g. '/pmc/articles/PMC123456/'

    Return
    ------
    pmc: str
        PubMed Central ID (PMC) of an article with the 'PMC' text removed
    """
    pmc_text = [c for c in citation.split("/") if c != ""][-1]
    pmc = re.sub("PMC", "", pmc_text)
    return pmc


def convert_document_id(doc_id, id_type="PMC"):
    """
    Convert a given document id to dictionary of other id.
    Please see http://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/ for more info

    Parameters
    ----------
    doc_id: (int, str)
        A string or integer of document ID

    id_type: str
        A document ID type corresponding to an input ``doc_id``
        default: 'PMC'
        options: 'PMID', 'DOI', or 'OTHER'

    Return
    ------
    output_dict: dict
        A dictionary contains possible mapping of a given document ID
        including 'pmc', 'pmid', and 'doi'.
        'pmid' and 'doi' are empty strings when the converter record
        does not provide them

    Raises
    ------
    ValueError
        If ``id_type`` is not one of the supported options, or if the
        converter response has no usable record (no record, a record with
        a ``status`` attribute, or a record without ``pmcid``)

    Examples
    --------
    >>> pubmed_parser.pubmed_web_parser.convert_document_id(6933944, id_type='PMC')
    {'pmc': 'PMC6933944', 'pmid': '31624211', 'doi': '10.1126/science.aax1562'}
    """
    doc_id = str(doc_id)
    if id_type == "PMC":
        # accept both '6933944' and 'PMC6933944' without doubling the prefix
        if not doc_id.startswith("PMC"):
            doc_id = "PMC{}".format(doc_id)
    elif id_type not in ["PMID", "DOI", "OTHER"]:
        raise ValueError("Give id_type from PMC or PMID or DOI or OTHER")

    convert_link = "http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids={}".format(
        doc_id
    )
    convert_page = requests.get(convert_link)
    convert_tree = html.fromstring(convert_page.content)

    record_element = convert_tree.find("record")
    if record_element is None:
        raise ValueError("Cannot convert given document id to PMC")
    record = record_element.attrib
    if "status" in record or "pmcid" not in record:
        raise ValueError("Cannot convert given document id to PMC")

    # for a PMC input the (prefixed) input itself is reported back
    pmc = doc_id if id_type == "PMC" else record["pmcid"]
    pmid = record["pmid"] if "pmid" in record else ""
    doi = record["doi"] if "doi" in record else ""
    return {"pmc": pmc, "pmid": pmid, "doi": doi}
def _fetch_citedby_page(link):
    """Request one citedby page; return its tree and the PMC numbers linked from it."""
    page = requests.get(link)
    tree = html.fromstring(page.content)
    # the first link is skipped -- presumably it points at the queried article itself
    hrefs = tree.xpath('//div[@class="rprt"]/div[@class="title"]/a/@href')[1:]
    return tree, [extract_pmc(href) for href in hrefs]


def parse_citation_web(doc_id, id_type="PMC"):
    """
    Parse citations from given document id

    Parameters
    ----------
    doc_id: (str, int)
        document id

    id_type: str
        corresponding type of doc_id. This can be a choice
        from the following ['PMC', 'PMID', 'DOI', 'OTHER']

    Return
    ------
    dict_out: dict
        output is a dictionary contains following keys
        'pmc' (Pubmed Central ID, without the 'PMC' text),
        'pmid' (Pubmed ID),
        'doi' (DOI of an article),
        'n_citations' (number of citations for given articles),
        'pmc_cited' (list of PMCs that cite the given PMC)

    Raises
    ------
    ValueError
        Propagated from ``convert_document_id`` when the document ID
        cannot be converted

    Examples
    --------
    >>> pubmed_parser.parse_citation_web(6933944, id_type='PMC')
    {
        'n_citations': 0,
        'pmid': '31624211',
        'pmc': '6933944',
        'doi': '10.1126/science.aax1562',
        'pmc_cited': []
    }
    """
    assert id_type in ["PMC", "PMID", "DOI", "OTHER"]

    doc_id_dict = convert_document_id(doc_id, id_type=id_type)
    pmc = doc_id_dict["pmc"]
    pmc_number = re.sub("PMC", "", pmc)

    link = "http://www.ncbi.nlm.nih.gov/pmc/articles/{}/citedby/".format(pmc)
    tree, pmc_cited_all = _fetch_citedby_page(link)
    n_citations = extract_citations(tree)

    # 30 entries per page: round up so an exact multiple of 30 does not
    # trigger a request for a page past the last one
    per_page = 30
    n_pages = max(1, (n_citations + per_page - 1) // per_page)
    for i in range(2, n_pages + 1):
        link = "http://www.ncbi.nlm.nih.gov/pmc/articles/{}/citedby/?page={}".format(
            pmc, i
        )
        _, pmc_cited = _fetch_citedby_page(link)
        pmc_cited_all.extend(pmc_cited)

    # extract_pmc strips 'PMC', so compare by value against the bare number
    pmc_cited_all = [p for p in pmc_cited_all if p != pmc_number]

    dict_out = {
        "n_citations": n_citations,
        "pmid": doc_id_dict["pmid"],
        "pmc": pmc_number,
        "doi": doc_id_dict["doi"],
        "pmc_cited": pmc_cited_all,
    }
    return dict_out
def parse_outgoing_citation_web(doc_id, id_type="PMC"):
    """
    A function to load citations from NCBI eutils API for a given document

    Example URL:
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pmc&linkname=pmc_refs_pubmed&id=221212

    Parameters
    ----------
    doc_id: (str, int)
        The document ID

    id_type: str
        A type of provided document ID, can be either 'PMC' or 'PMID'

    Return
    ------
    dict_out: dict
        a dictionary containing the following keys
        'n_citations' (number of citations for that article),
        'doc_id' (the document ID number),
        'id_type' (the type of document ID provided (PMCID or PMID)),
        'pmid_cited' (a list of papers cited by the document as PMIDs).
        None is returned instead when the response lists no linked IDs

    Raises
    ------
    ValueError
        If ``id_type`` is neither 'PMC' nor 'PMID'

    Examples
    --------
    >>> pubmed_parser.parse_outgoing_citation_web(6933944, id_type='PMC')
    {
        'n_citations': 11,
        'doc_id': '6933944',
        'id_type': 'PMC',
        'pmid_cited': ['30705152', ..., ]
    }
    """
    doc_id = str(doc_id)
    if id_type == "PMC":
        db = "pmc"
        linkname = "pmc_refs_pubmed"
    elif id_type == "PMID":
        db = "pubmed"
        linkname = "pubmed_pubmed_refs"
    else:
        raise ValueError("Unsupported id_type `{}`".format(id_type))
    link = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom={}&linkname={}&id={}".format(
        db, linkname, doc_id
    )

    parser = etree.XMLParser()
    with urlopen(link) as f:
        tree = etree.parse(f, parser)
    pmid_cited_all = tree.xpath("/eLinkResult/LinkSet/LinkSetDb/Link/Id/text()")
    n_citations = len(pmid_cited_all)
    if not n_citations:  # If there are no citations, likely a bad doc_id
        return None
    dict_out = {
        "n_citations": n_citations,
        "doc_id": doc_id,
        "id_type": id_type,
        "pmid_cited": pmid_cited_all,
    }
    return dict_out