# Source code for papget.papget

# !/usr/bin/env python2
# -*- coding: utf-8 -*-
""" Collected methods and classes for providers of papers, e.g.
Springer.
"""

from __future__ import print_function, division, unicode_literals

import re

from bs4 import BeautifulSoup
import requests
from mechanize import Browser

class Provider(object):
    """ Class representing the providers of papers

    Note:
        Do not instantiate this class directly; inherit from it and
        override :func:`need_to_pay` and :func:`get_pdf_url`.
    """
    NAME = ''
    """ (str): Name of provider
    """
    RE_URL = None
    """ (:class:`re.RegexObject`):
            Compiled regex used for matching URLs to this
            provider
    """

    def __repr__(self):
        return self.NAME

    @classmethod
    def get_soup(cls, url, browser=None):
        """ Get a parsed version of the HTML source of an URL

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (:class:`bs4.BeautifulSoup`)
                Parsed version of HTML source
        """
        browser = cls.get_browser(browser)
        browser.open(url)
        html = browser.response().read()
        return BeautifulSoup(html, 'html5lib')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Check whether one needs to pay for PDF download

        The base implementation is a no-op hook that returns
        ``None``; :func:`papget` treats any falsy result as
        "no payment required".

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            A truthy value if payment is required, a falsy one
            otherwise.
        """
        return None

    @classmethod
    def papget(cls, url, filename, browser=None):
        """ Comfortably download the PDF from a given URL

        Args:
            url (str):
                    URL pointing to desired webpage
            filename (str):
                    Path of the file the PDF is written to
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) ``filename`` after the PDF has been written, or
            ``None`` if :func:`need_to_pay` reports that the paper
            is behind a paywall (nothing is written in that case).

        Raises:
            requests.HTTPError: if the PDF request is answered with
                an error status code.
        """
        browser = cls.get_browser(browser)
        if cls.need_to_pay(url, browser):
            return None

        link = cls.get_pdf_url(url, browser)
        req = requests.get(link)
        # Fail loudly instead of saving an HTTP error page as "PDF".
        req.raise_for_status()
        with open(filename, 'wb') as pdf:
            pdf.write(req.content)
        return filename

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        The base implementation is a no-op hook that returns
        ``None``; subclasses are expected to override it.

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) URL of the PDF resource
        """
        return None

    @staticmethod
    def get_browser(browser=None):
        """ Create new browser if none is present.

        Args:
            browser (Optional[:class:`mechanize.Browser`]):
                    Existing browser to reuse. It is returned
                    unchanged whenever it is not ``None``.

        Returns:
            (:class:`mechanize.Browser`)
        """
        # Identity check on purpose: a caller-supplied browser must be
        # reused even if it happens to evaluate as falsy.
        if browser is None:
            browser = Browser()
            browser.set_handle_robots(False)
            browser.addheaders = [(
                'User-agent',
                ('Mozilla/5.0 (X11; U; Linux i686; en-US; '
                 'rv:1.9.0.1) Gecko/2008071615 '
                 'Fedora/3.0.1-1.fc9 Firefox/3.0.1'))]

        return browser

class Springer(Provider):
    """ Provider implementation for Springer

    Examples:
        >>> url = 'https://link.springer.com/article/10.1007%2Fs40065-017-0185-1'
        >>> bool(Springer.need_to_pay(url))
        False
        >>> Springer.get_pdf_url(url)
        u'https://link.springer.com/content/pdf/...
        >>> Springer.papget(url, 'temp.pdf')
        u'temp.pdf'
        >>> import os; os.remove('temp.pdf')
    """
    NAME = 'Springer'
    """ (str): Name of provider
    """
    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'https://link\.springer\.com')
    """ (:class:`re.RegexObject`):
            Compiled regex used for matching URLs to this
            provider
    """

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Check whether the page shows a buy box

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            The ``<span class="buybox__buy">`` element if the page
            contains one, otherwise ``None``. Use ``bool()`` on the
            result for a plain yes/no answer.
        """
        soup = cls.get_soup(url, browser)
        return soup.find('span', attrs={'class': 'buybox__buy'})

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Looks for the book download anchor first and falls back to
        the parent element of a ``<span>PDF</span>`` label.

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) The link's ``href`` appended to
            ``https://link.springer.com``

        Raises:
            RuntimeError: if neither kind of PDF link is on the page.
        """
        soup = cls.get_soup(url, browser)
        anchor = soup.find('a',
                           title=('Download this book in PDF '
                                  'format'))
        if anchor is None:
            label = soup.find('span', string='PDF')
            if label is None:
                # Previously this surfaced as an AttributeError on None.
                raise RuntimeError(
                    'No PDF link found at {}'.format(url))
            anchor = label.parent
        return 'https://link.springer.com%s' % anchor['href']



class Cammbridge(Provider):
    """ Provider implementation for Cambridge University Press

    Note:
        The class name keeps its original spelling so that existing
        references to ``Cammbridge`` continue to work.

    Examples:
        >>> url = 'https://bit.ly/2KoN7vU'
        >>> bool(Cammbridge.need_to_pay(url))
        False
        >>> Cammbridge.get_pdf_url(url)
        u'https://www.cambridge.org/core/services/aop-...
        >>> Cammbridge.papget(url, 'temp.pdf')
        u'temp.pdf'
        >>> import os; os.remove('temp.pdf')
        >>> url2 = 'https://bit.ly/2HBtD9F'
        >>> bool(Cammbridge.need_to_pay(url2))
        True
    """
    NAME = 'Cambridge University Press'
    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'www\.cambridge\.org')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Check whether the page offers a "Get access" link

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            The ``<a>`` element whose text is ``Get access`` if the
            page contains one, otherwise ``None``.
        """
        soup = cls.get_soup(url, browser)
        return soup.find('a', string='Get access')

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) The ``href`` of the "Download PDF" anchor appended
            to ``https://www.cambridge.org``

        Raises:
            RuntimeError: if the page has no "Download PDF" anchor.
        """
        soup = cls.get_soup(url, browser)
        anchor = soup.find('a',
                           attrs={'aria-label': 'Download PDF'})
        if anchor is None:
            # Previously this surfaced as a TypeError on None.
            raise RuntimeError(
                'No PDF link found at {}'.format(url))
        return 'https://www.cambridge.org%s' % anchor['href']

class Ams(Provider):
    """ Provider implementation for American Mathematical
    Society

    Examples:
        >>> url = 'https://bit.ly/2HEalfN'
        >>> bool(Ams.need_to_pay(url))
        False
        >>> Ams.get_pdf_url(url)
        u'http://www.ams.org/journals/jams/2016-29-01/...
        >>> Ams.papget(url, 'temp.pdf')
        u'temp.pdf'
        >>> import os; os.remove('temp.pdf')
    """
    NAME = 'American Mathematical Society'
    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'www\.ams\.org')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Check whether the page contains the bookstore buy box

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            The ``<div id="buy_in_amsbookstore_div">`` element if
            the page contains one, otherwise ``None``.
        """
        soup = cls.get_soup(url, browser)
        return soup.find('div',
                         id='buy_in_amsbookstore_div')

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        The "Full-text PDF" link is opened with the browser and the
        URL the browser ends up at is returned.

        Args:
            url (str):
                    URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) URL reported by the browser after opening the link

        Raises:
            RuntimeError: if the page has no "Full-text PDF" anchor.
        """
        browser = cls.get_browser(browser)
        soup = cls.get_soup(url, browser)
        anchor = soup.find('a', string='Full-text PDF')
        if anchor is None:
            # Previously this surfaced as a TypeError on None.
            raise RuntimeError(
                'No PDF link found at {}'.format(url))
        link = anchor['href']
        # NOTE(review): get_soup() has already opened ``url`` with this
        # same browser, so this second open looks redundant; it is kept
        # to preserve the original request sequence.
        browser.open(url)
        browser.open(link)
        return browser.geturl()

class SciHub(Provider):
    """ Provider implementation for Sci-Hub

    Raises:
        RuntimeError: if a CAPTCHA is encountered.
    """
    NAME = 'Sci-Hub'
    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'sci-hub\.tw')
    MIN_PDF_BYTES = 3000
    """ (int): Downloads smaller than this many bytes are rejected
            as not being a valid PDF
    """

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Always report that no payment is required

        Args:
            url (str): Unused
            browser (Optional[:class:`mechanize.Browser`]): Unused

        Returns:
            (bool) ``False``
        """
        return False

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Args:
            url (str):
                    DOI resolver URL, i.e. ``http://dx.doi.org/<doi>``;
                    ``https`` and the bare ``doi.org`` host are
                    accepted as well.
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) The quoted value taken from the ``onclick``
            attribute of the page's save button

        Raises:
            ValueError: if ``url`` is not a DOI resolver URL.
            RuntimeError: if the save button or its link cannot be
                found on the Sci-Hub page.
        """
        doi_match = re.match(r'https?://(?:dx\.)?doi\.org/(.*)', url)
        if doi_match is None:
            raise ValueError(
                'Not a DOI resolver URL: {}'.format(url))
        doi = doi_match.group(1)

        scihub = 'http://sci-hub.tw/'
        browser = cls.get_browser(browser)
        soup = cls.get_soup(scihub + doi, browser)
        save = soup.find('div',
                         attrs={'class': 'button',
                                'id': 'save'})
        try:
            onclick = save.p.a['onclick']
        except (AttributeError, TypeError, KeyError):
            # Any missing piece of div#save > p > a[onclick] ends here.
            raise RuntimeError(
                'No download button found for {}'.format(doi))

        # The target is the single-quoted value after the first '='.
        link_match = re.match(r'.*?=\'(.*)\'', onclick)
        if link_match is None:
            raise RuntimeError(
                'No download link found for {}'.format(doi))
        return link_match.group(1)

    @classmethod
    def papget(cls, url, filename, browser=None):
        """ Download the PDF for a DOI URL via Sci-Hub

        Args:
            url (str):
                    DOI resolver URL, see :func:`get_pdf_url`
            filename (str):
                    Path of the file the PDF is written to
            browser (Optional[:class:`mechanize.Browser`]):
                    If no browser is provided, a new instance
                    will be created.

        Returns:
            (str) ``filename`` after the PDF has been written

        Raises:
            RuntimeError: if the response contains the
                ``CaptchaRedirect`` marker or is smaller than
                :attr:`MIN_PDF_BYTES`.
        """
        browser = cls.get_browser(browser)
        if cls.need_to_pay(url, browser):
            return None

        link = cls.get_pdf_url(url, browser)
        req = requests.get(link)
        content = req.content
        # Search the raw bytes: decoding a whole binary PDF to text only
        # to look for an ASCII marker is needlessly expensive.
        if b'CaptchaRedirect' in content:
            raise RuntimeError('Captcha encountered')
        if len(content) < cls.MIN_PDF_BYTES:
            msg = 'File size too small to be valid PDF: {}'
            raise RuntimeError(msg.format(len(content)))

        with open(filename, 'wb') as pdf:
            pdf.write(content)
        return filename

# Provider classes collected in one list. SciHub is defined in this module
# but is not part of the list.
# NOTE(review): presumably consumers match a URL against each RE_URL to
# pick a provider -- confirm against the callers.
ALL_PROVIDERS = [Springer, Cammbridge, Ams]