"""Source code for pubmed_parser.pubmed_web_parser."""

import sys
import re
import time
import requests
from lxml import etree
from lxml import html
from unidecode import unidecode

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from .utils import stringify_children

__all__ = ["parse_xml_web", "parse_citation_web", "parse_outgoing_citation_web"]


def load_xml(pmid, sleep=None):
    """
    Download the eutils efetch record of a PMID and return it as a parsed tree.

    Parameters
    ----------
    pmid: (int, str)
        PMID of the record to request

    sleep: int
        number of seconds to pause once the response has been parsed,
        useful for spacing out consecutive requests
        default: None (no pause)

    Return
    ------
    tree: Element
        The response body parsed with ``lxml.html.fromstring``
    """
    url_template = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id={}"
    response = requests.get(url_template.format(pmid))
    parsed = html.fromstring(response.content)
    if sleep is None:
        return parsed
    # The pause happens after the download, right before handing back the tree.
    time.sleep(sleep)
    return parsed


def _node_text(node):
    """Return ``node.text``, or an empty string when the node or its text is missing."""
    if node is None or node.text is None:
        return ""
    return node.text


def parse_pubmed_web_tree(tree):
    """
    Giving a tree Element from eutils, return parsed dictionary from the tree

    Missing elements and elements without text are treated as empty strings,
    so a sparse record does not make the parser fail.

    Parameters
    ----------
    tree: Element
        An lxml Element parsed from eutil website

    Return
    ------
    dict_out: dict
        A parsed output in dictionary format with the keys
        'title', 'abstract', 'journal',
        'affiliation' (string of affiliations separated by '; '),
        'authors' (string of author names separated by '; '),
        'keywords' (MeSH terms formatted as 'MeSH descriptor UI:MeSH name' and
        separated by ';'; if the record has no MeSH terms, the plain keywords
        separated by ';'),
        'doi', 'pii', 'year', 'language'.
        'doi', 'pii' and 'language' are None when the record does not have them.
    """
    # Articles carry <articletitle>; books carry <booktitle> instead.
    title_nodes = tree.xpath("//articletitle") or tree.xpath("//booktitle")
    title = " ".join(_node_text(t) for t in title_nodes)

    abstract_tree = tree.xpath("//abstract/abstracttext")
    abstract = " ".join([stringify_children(a).strip() for a in abstract_tree])

    journal = ";".join(_node_text(t).strip() for t in tree.xpath("//article//title"))

    pubdate = tree.xpath('//pubmeddata//history//pubmedpubdate[@pubstatus="medline"]')
    pubdatebook = tree.xpath(
        '//pubmedbookdata//history//pubmedpubdate[@pubstatus="medline"]'
    )
    year = ""
    # Prefer the article history; fall back to the book history.
    for history in (pubdate, pubdatebook):
        if history and history[0].find("year") is not None:
            year = _node_text(history[0].find("year"))
            break

    affiliations_text = "; ".join(
        _node_text(affil) for affil in tree.xpath("//affiliationinfo/affiliation")
    )

    authors = []
    for a in tree.xpath("//authorlist/author"):
        # Each name part is looked up on its own, so an author that only has a
        # last name (or only a fore name) is still reported.
        firstname = _node_text(a.find("forename"))
        lastname = _node_text(a.find("lastname"))
        fullname = (firstname + " " + lastname).strip()
        if fullname == "":
            # Group authors have no personal name, only a collective name.
            fullname = _node_text(a.find("collectivename"))
        authors.append(fullname)
    authors_text = "; ".join(authors)

    keywords_mesh = tree.xpath("//meshheadinglist//meshheading")
    keywords_book = tree.xpath("//keywordlist//keyword")
    if len(keywords_mesh) > 0:
        mesh_terms_list = []
        for m in keywords_mesh:
            descriptor = m.find("descriptorname")
            if descriptor is None:
                # A heading without a descriptor has nothing to report.
                continue
            mesh_terms_list.append(
                descriptor.attrib.get("ui", "") + ":" + _node_text(descriptor)
            )
        keywords = ";".join(mesh_terms_list)
    elif len(keywords_book) > 0:
        keywords = ";".join([m.text or "" for m in keywords_book])
    else:
        keywords = ""

    doi_nodes = tree.xpath('//elocationid[@eidtype="doi"]')
    doi = doi_nodes[0].text if doi_nodes else None

    pii_nodes = tree.xpath('//elocationid[@eidtype="pii"]')
    pii = pii_nodes[0].text if pii_nodes else None

    language_nodes = tree.xpath("//language")
    language = language_nodes[0].text if language_nodes else None

    dict_out = {
        "title": title,
        "abstract": abstract,
        "journal": journal,
        "affiliation": affiliations_text,
        "authors": authors_text,
        "keywords": keywords,
        "doi": doi,
        "pii": pii,
        "year": year,
        "language": language,
    }
    return dict_out


def parse_xml_web(pmid, sleep=None, save_xml=False):
    """
    Give an input PMID, load and parse XML using PubMed eutils

    Parameters
    ----------
    pmid: (int, str)
        A PMID which you want to parse from eutils
    sleep: int
        How long (in seconds) to wait after loading one PMID from eutils
        default: None
    save_xml: bool
        if it is True, store the serialized tree (the output of
        ``lxml.etree.tostring``) under the key ``xml`` in the output dictionary.
        This is useful for checking the raw record behind the parsed fields.
        if it is False, the full XML is not included in the output
        default: False

    Return
    ------
    dict_out: dict
        A dictionary contains information of parsed XML from a given PMID:
        every key produced by ``parse_pubmed_web_tree`` plus 'pmid'
        (the given PMID as a string) and, when ``save_xml`` is True, 'xml'

    Examples
    --------
    >>> pubmed_parser.parse_xml_web(11360989, sleep=1, save_xml=False)
    {
        'title': 'Molecular biology and evolution. Can genes explain biological complexity?',
        'abstract': '',
        'journal': 'Science (New York, N.Y.)',
        'affiliation': 'Collegium Budapest (Institute for Advanced Study), 2 Szentháromság u., H-1014 Budapest, Hungary. szathmary@colbud.hu',
        'authors': 'E Szathmáry; F Jordán; C Pál',
        'keywords': 'D000818:Animals;D005075:Biological Evolution;...',
        'doi': '10.1126/science.1060852',
        'year': '2001',
        'pmid': '11360989'
    }

    The 'pii' and 'language' keys are also part of the output; they are left
    out of the example above.
    """
    tree = load_xml(pmid, sleep=sleep)
    dict_out = parse_pubmed_web_tree(tree)
    dict_out["pmid"] = str(pmid)
    if save_xml:
        dict_out["xml"] = etree.tostring(tree)
    return dict_out


def extract_citations(tree):
    """
    Extract number of citations from a parsed "cited by" page.

    The count is read from the first ``<h2 class="head">`` text found inside a
    form, after removing the leading "Is Cited by the Following " phrase.

    Parameters
    ----------
    tree: Element
        An lxml Element parsed from the "cited by" page of an article

    Return
    ------
    n_citations: int
        Number of citations that an article get until parsed date.
        If the heading is missing or does not start with a number, return 0
    """
    headings = tree.xpath('//form/h2[@class="head"]/text()')
    if not headings:
        # No heading at all means the page lists no citations.
        return 0
    count_token = re.sub("Is Cited by the Following ", "", headings[0]).split(" ")[0]
    try:
        return int(count_token)
    except ValueError:
        # The heading did not start with a number.
        return 0


def extract_pmc(citation):
    """
    Pull the PMC number out of a citation link.

    Parameters
    ----------
    citation: str
        A '/'-separated link whose last non-empty segment holds the PMC ID,
        e.g. '/pmc/articles/PMC6933944/'

    Return
    ------
    pmc: str
        The last non-empty segment of the link with every 'PMC' removed
    """
    # Dropping empty pieces makes leading, trailing and doubled slashes harmless.
    segments = list(filter(None, citation.split("/")))
    return segments[-1].replace("PMC", "")


def convert_document_id(doc_id, id_type="PMC"):
    """
    Convert a given document id to dictionary of other id.
    Please see http://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/ for more info

    Parameters
    ----------
    doc_id: (int, str)
        A string or integer of document ID. For ``id_type='PMC'`` the ID may be
        given with or without the 'PMC' prefix
    id_type: str
        A document ID type corresponding to an input ``doc_id``
        default: 'PMC'
        options: 'PMC', 'PMID', 'DOI', or 'OTHER'

    Return
    ------
    output_dict: dict
        A dictionary contains possible mapping of a given document ID including 'pmc', 'pmid', and 'doi'.
        'pmid' and 'doi' are empty strings when the converter does not report them

    Raises
    ------
    ValueError
        If ``id_type`` is not one of the supported options, or if the converter
        response has no usable record for the document (no record, a record
        carrying a ``status`` attribute, or a record without a PMC ID)

    Examples
    --------
    >>> pubmed_parser.pubmed_web_parser.convert_document_id(6933944, id_type='PMC')
    {'pmc': 'PMC6933944', 'pmid': '31624211', 'doi': '10.1126/science.aax1562'}
    """
    doc_id = str(doc_id)
    if id_type == "PMC":
        # Add the prefix only when it is missing, so 'PMC123' is not turned
        # into 'PMCPMC123'.
        if not doc_id.startswith("PMC"):
            doc_id = "PMC{}".format(doc_id)
    elif id_type not in ("PMID", "DOI", "OTHER"):
        raise ValueError("Give id_type from PMC or PMID or DOI or OTHER")

    convert_link = "http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids={}".format(
        doc_id
    )
    convert_page = requests.get(convert_link)
    convert_tree = html.fromstring(convert_page.content)
    record_node = convert_tree.find("record")
    if record_node is None:
        # Report a response without any record the same way as a failed lookup.
        raise ValueError("Cannot convert given document id to PMC")
    record = record_node.attrib
    if "status" in record or "pmcid" not in record:
        raise ValueError("Cannot convert given document id to PMC")

    # For a PMC input the (prefixed) input ID is returned as is; otherwise the
    # PMC ID comes from the converter record.
    pmc = doc_id if id_type == "PMC" else record["pmcid"]
    return {"pmc": pmc, "pmid": record.get("pmid", ""), "doi": record.get("doi", "")}


def parse_citation_web(doc_id, id_type="PMC"):
    """
    Parse citations from given document id

    Parameters
    ----------
    doc_id: (str, int)
        document id
    id_type: str
        corresponding type of doc_id. This can be a choice from the following ['PMC', 'PMID', 'DOI', 'OTHER']

    Return
    ------
    dict_out: dict
        output is a dictionary contains following keys
        'pmc' (Pubmed Central ID without the 'PMC' prefix), 'pmid' (Pubmed ID),
        'doi' (DOI of an article),  'n_citations' (number of citations for given articles),
        'pmc_cited' (list of PMCs that cite the given PMC, without the 'PMC' prefix)

    Raises
    ------
    ValueError
        If ``id_type`` is not one of the supported options, or if the document
        ID cannot be converted to a PMC ID

    Examples
    --------
    >>> pubmed_parser.parse_citation_web(6933944, id_type='PMC')
    {
        'n_citations': 0,
        'pmid': '31624211',
        'pmc': '6933944',
        'doi': '10.1126/science.aax1562',
        'pmc_cited': []
    }
    """
    if id_type not in ("PMC", "PMID", "DOI", "OTHER"):
        raise ValueError("Give id_type from PMC or PMID or DOI or OTHER")

    doc_id_dict = convert_document_id(doc_id, id_type=id_type)
    pmc = doc_id_dict["pmc"]
    # extract_pmc returns IDs without the 'PMC' prefix, so compare and report
    # the given article in the same form.
    pmc_number = re.sub("PMC", "", pmc)
    base_link = "http://www.ncbi.nlm.nih.gov/pmc/articles/{}/citedby/".format(pmc)
    citation_xpath = '//div[@class="rprt"]/div[@class="title"]/a/@href'
    per_page = 30

    page = requests.get(base_link)
    tree = html.fromstring(page.content)
    n_citations = extract_citations(tree)
    # Ceiling division: exactly 30 citations fit on one page, 31 need two.
    n_pages = max(1, (n_citations + per_page - 1) // per_page)

    # The first link of every page is skipped, as in the listing layout this
    # parser was written for.
    pmc_cited_all = [extract_pmc(href) for href in tree.xpath(citation_xpath)[1:]]
    for page_number in range(2, n_pages + 1):
        page = requests.get("{}?page={}".format(base_link, page_number))
        tree = html.fromstring(page.content)
        pmc_cited_all.extend(
            extract_pmc(href) for href in tree.xpath(citation_xpath)[1:]
        )

    # An article is not listed as citing itself.
    pmc_cited_all = [p for p in pmc_cited_all if p != pmc_number]
    dict_out = {
        "n_citations": n_citations,
        "pmid": doc_id_dict["pmid"],
        "pmc": pmc_number,
        "doi": doc_id_dict["doi"],
        "pmc_cited": pmc_cited_all,
    }
    return dict_out


def parse_outgoing_citation_web(doc_id, id_type="PMC"):
    """
    A function to load citations from NCBI eutils API for a given document

    Example URL:
    https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pmc&linkname=pmc_refs_pubmed&id=221212

    Parameters
    ----------
    doc_id: (str, int)
        The document ID
    id_type: str
        A type of provided document ID, can be either 'PMC' or 'PMID'

    Return
    ------
    dict_out: dict
        a dictionary containing the following keys 'n_citations' (number of citations for that article),
        'doc_id' (the document ID number), 'id_type' (the type of document ID provided (PMCID or PMID)),
        'pmid_cited' (a list of papers cited by the document as PMIDs).
        None is returned when the response lists no cited papers

    Raises
    ------
    ValueError
        If ``id_type`` is neither 'PMC' nor 'PMID'

    Examples
    --------
    >>> pubmed_parser.parse_outgoing_citation_web(6933944, id_type='PMC')
    {
        'n_citations': 11,
        'doc_id': '6933944',
        'id_type': 'PMC',
        'pmid_cited': ['30705152', ..., ]
    }
    """
    doc_id = str(doc_id)
    if id_type == "PMC":
        db = "pmc"
        linkname = "pmc_refs_pubmed"
    elif id_type == "PMID":
        db = "pubmed"
        linkname = "pubmed_pubmed_refs"
    else:
        raise ValueError("Unsupported id_type `{}`".format(id_type))
    link = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom={}&linkname={}&id={}".format(
        db, linkname, doc_id
    )

    parser = etree.XMLParser()
    with urlopen(link) as f:
        tree = etree.parse(f, parser)
    # lxml returns "smart" strings that keep a reference to their parent
    # element; plain strings let the parsed tree be released.
    pmid_cited_all = [
        str(pmid)
        for pmid in tree.xpath("/eLinkResult/LinkSet/LinkSetDb/Link/Id/text()")
    ]
    n_citations = len(pmid_cited_all)
    if not n_citations:  # If there are no citations, likely a bad doc_id
        return None
    dict_out = {
        "n_citations": n_citations,
        "doc_id": doc_id,
        "id_type": id_type,
        "pmid_cited": pmid_cited_all,
    }
    return dict_out