#!/usr/bin/env python2
# -*- coding: utf-8 -*-
""" Collected methods and classes for providers of papers, e.g.
Springer.
"""
from __future__ import print_function, division, unicode_literals
import re
from bs4 import BeautifulSoup
import requests
from mechanize import Browser
class Provider(object):
    """ Base class representing the providers of papers

    Note:
        Do not instantiate this class but inherit from it and overwrite
        :func:`need_to_pay` and :func:`get_pdf_url`. The versions
        defined here do nothing and return ``None``.
    """

    NAME = ''
    """ (str): Name of provider
    """

    RE_URL = None
    """ (:class:`re.RegexObject`):
            Compiled regex used for matching URLs to this
            provider
    """

    def __repr__(self):
        return self.NAME

    @classmethod
    def get_soup(cls, url, browser=None):
        """ Get a parsed version of the HTML source of an URL

        Args:
            url (str):
                URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                If no browser is provided, a new instance
                will be created.

        Returns:
            (:class:`bs4.BeautifulSoup`)
                Parsed version of HTML source
        """
        browser = cls.get_browser(browser)
        # Opening the page leaves ``browser`` positioned on ``url``.
        browser.open(url)
        html = browser.response().read()
        return BeautifulSoup(html, 'html5lib')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Check whether one needs to pay for PDF download

        Subclasses overwrite this method; :func:`papget` treats any
        truthy return value as "payment required".

        Args:
            url (str):
                URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                If no browser is provided, a new instance
                will be created.

        Returns:
            ``None`` in this base implementation.
        """
        return None

    @classmethod
    def papget(cls, url, filename, browser=None):
        """ Comfortably download the PDF from a given URL

        Args:
            url (str):
                URL pointing to desired webpage
            filename (str):
                Path of the file the PDF content is written to
            browser (Optional[:class:`mechanize.Browser`]):
                If no browser is provided, a new instance
                will be created.

        Returns:
            (str) ``filename`` once the PDF has been written, or
            ``None`` if :func:`need_to_pay` reports that payment is
            required (nothing is downloaded in that case).

        Raises:
            requests.exceptions.HTTPError:
                if the PDF request is answered with an HTTP error
                status.
        """
        browser = cls.get_browser(browser)
        if cls.need_to_pay(url, browser):
            # NOTE(review): assumes the paywalled case yields None, i.e.
            # that ``return filename`` belongs to the download branch --
            # confirm against callers.
            return None
        link = cls.get_pdf_url(url, browser)
        req = requests.get(link)
        # Never store an HTTP error page under the PDF filename.
        req.raise_for_status()
        with open(filename, 'wb') as pdf:
            pdf.write(req.content)
        return filename

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Subclasses overwrite this method.

        Args:
            url (str):
                URL pointing to desired webpage
            browser (Optional[:class:`mechanize.Browser`]):
                If no browser is provided, a new instance
                will be created.

        Returns:
            ``None`` in this base implementation.
        """
        return None

    @staticmethod
    def get_browser(browser=None):
        """ Create new browser if none is present.

        Args:
            browser (Optional[:class:`mechanize.Browser`]):
                Existing browser; handed back unchanged when given.

        Returns:
            (:class:`mechanize.Browser`)
        """
        # Identity check: only a missing browser triggers creation, so an
        # object that merely evaluates as false is still reused.
        if browser is None:
            browser = Browser()
            browser.set_handle_robots(False)
            browser.addheaders = [(
                'User-agent',
                ('Mozilla/5.0 (X11; U; Linux i686; en-US; '
                 'rv:1.9.0.1) Gecko/2008071615 '
                 'Fedora/3.0.1-1.fc9 Firefox/3.0.1'))]
        return browser
class Springer(Provider):
    """ Provider implementation for Springer

    Examples:
        >>> url = 'https://link.springer.com/article/10.1007%2Fs40065-017-0185-1'
        >>> bool(Springer.need_to_pay(url))
        False
        >>> Springer.get_pdf_url(url)
        u'https://link.springer.com/content/pdf/...
        >>> Springer.papget(url, 'temp.pdf')
        u'temp.pdf'
        >>> import os; os.remove('temp.pdf')
    """

    NAME = 'Springer'
    """ (str): Name of provider
    """

    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'https://link\.springer\.com')
    """ (:class:`re.RegexObject`):
            Compiled regex used for matching URLs to this
            provider
    """

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Look for the buy box on the page behind ``url``

        Returns:
            The ``<span class="buybox__buy">`` element if the page
            contains one (truthy), otherwise ``None``.
        """
        soup = cls.get_soup(url, browser)
        return soup.find('span', attrs={'class': 'buybox__buy'})

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Returns:
            (str) ``https://link.springer.com`` followed by the
            ``href`` of the download link found on the page.

        Raises:
            AttributeError: if neither kind of download link is found.
        """
        soup = cls.get_soup(url, browser)
        # Preferred: the anchor carrying the book download title.
        anchor = soup.find('a',
                           title=('Download this book in PDF '
                                  'format'))
        if anchor is None:
            # Fallback: the element enclosing a <span> whose text is 'PDF'.
            anchor = soup.find('span', string='PDF').parent
        return 'https://link.springer.com%s' % anchor['href']
class Cammbridge(Provider):
    """ Provider implementation for Cambridge University Press

    Examples:
        >>> url = 'https://bit.ly/2KoN7vU'
        >>> bool(Cammbridge.need_to_pay(url))
        False
        >>> Cammbridge.get_pdf_url(url)
        u'https://www.cambridge.org/core/services/aop-...
        >>> Cammbridge.papget(url, 'temp.pdf')
        u'temp.pdf'
        >>> import os; os.remove('temp.pdf')
        >>> url2 = 'https://bit.ly/2HBtD9F'
        >>> bool(Cammbridge.need_to_pay(url2))
        True
    """

    # Runtime value; spelling intentionally left as it is.
    NAME = 'Cammbridge University Press'

    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'www\.cambridge\.org')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Look for a 'Get access' link on the page behind ``url``

        Returns:
            The ``<a>`` element whose text is ``Get access`` if the
            page contains one (truthy), otherwise ``None``.
        """
        soup = cls.get_soup(url, browser)
        return soup.find('a', string='Get access')

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Returns:
            (str) ``https://www.cambridge.org`` followed by the
            ``href`` of the link labelled 'Download PDF'.

        Raises:
            TypeError: if the page has no such link.
        """
        soup = cls.get_soup(url, browser)
        anchor = soup.find('a',
                           attrs={'aria-label': 'Download PDF'})
        return 'https://www.cambridge.org%s' % anchor['href']
class Ams(Provider):
    """ Provider implementation for American Mathematical
    Society

    Examples:
        >>> url = 'https://bit.ly/2HEalfN'
        >>> bool(Ams.need_to_pay(url))
        False
        >>> Ams.get_pdf_url(url)
        u'http://www.ams.org/journals/jams/2016-29-01/...
        >>> Ams.papget(url, 'temp.pdf')
        u'temp.pdf'
        >>> import os; os.remove('temp.pdf')
    """

    NAME = 'American Mathematical Society'

    # Dots are escaped so that they only match a literal '.'.
    RE_URL = re.compile(r'www\.ams\.org')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Look for the bookstore box on the page behind ``url``

        Returns:
            The ``<div id="buy_in_amsbookstore_div">`` element if the
            page contains one (truthy), otherwise ``None``.
        """
        soup = cls.get_soup(url, browser)
        return soup.find('div',
                         id='buy_in_amsbookstore_div')

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        The 'Full-text PDF' link is followed with the browser and the
        URL the browser ends up at is returned.

        Raises:
            TypeError: if the page has no 'Full-text PDF' link.
        """
        # Resolve the browser first so that the same instance is used
        # for parsing the page and for following the link.
        browser = cls.get_browser(browser)
        soup = cls.get_soup(url, browser)
        link = soup.find('a', string='Full-text PDF')['href']
        # NOTE(review): get_soup() has already opened ``url`` on this
        # browser; the page is opened again here, presumably so that
        # ``link`` is followed from the article page -- confirm before
        # removing.
        browser.open(url)
        browser.open(link)
        return browser.geturl()
class SciHub(Provider):
    """ Provider implementation for Sci-Hub

    Raises:
        RuntimeError: if a CAPTCHA is encountered or the downloaded
            content is too small to be a PDF.
    """

    NAME = 'Sci-Hub'

    # The dot is escaped so that it only matches a literal '.'.
    RE_URL = re.compile(r'sci-hub\.tw')

    @classmethod
    def need_to_pay(cls, url, browser=None):
        """ Always report that no payment is needed

        Returns:
            (bool) ``False``
        """
        return False

    @classmethod
    def get_pdf_url(cls, url, browser=None):
        """ Get URL of PDF resource

        Args:
            url (str):
                DOI link of the form ``http://dx.doi.org/<doi>``;
                ``https`` and the plain ``doi.org`` host are accepted
                as well.
            browser (Optional[:class:`mechanize.Browser`]):
                If no browser is provided, a new instance
                will be created.

        Returns:
            (str) The target quoted in the ``onclick`` attribute of
            the save button on the Sci-Hub page.

        Raises:
            AttributeError: if ``url`` is not a DOI link or the save
                button is not found on the page.
        """
        doi = re.match(r'https?://(?:dx\.)?doi\.org/(.*)', url).group(1)
        scihub = 'http://sci-hub.tw/'
        browser = cls.get_browser(browser)
        soup = cls.get_soup(scihub + doi, browser)
        save_button = soup.find('div',
                                attrs={'class': 'button',
                                       'id': 'save'})
        onclick = save_button.p.a['onclick']
        # Pull out the single-quoted value that follows the '='.
        return re.match(r'.*?=\'(.*)\'', onclick).group(1)

    @classmethod
    def papget(cls, url, filename, browser=None):
        """ Download the PDF for a DOI link and validate the result

        Args:
            url (str):
                DOI link, see :func:`get_pdf_url`
            filename (str):
                Path of the file the PDF content is written to
            browser (Optional[:class:`mechanize.Browser`]):
                If no browser is provided, a new instance
                will be created.

        Returns:
            (str) ``filename`` once the PDF has been written.

        Raises:
            RuntimeError: if the response contains 'CaptchaRedirect'
                or is shorter than 3000 bytes.
        """
        browser = cls.get_browser(browser)
        if cls.need_to_pay(url, browser):
            # Not reached with this class's own need_to_pay(), which
            # always returns False.
            return None
        link = cls.get_pdf_url(url, browser)
        req = requests.get(link)
        if 'CaptchaRedirect' in req.text:
            raise RuntimeError('Captach encountered')
        content = req.content
        # Very small responses are rejected as not being a real PDF.
        if len(content) < 3000:
            msg = 'File size too small to be valid PDF: {}'
            raise RuntimeError(msg.format(len(content)))
        with open(filename, 'wb') as pdf:
            pdf.write(content)
        return filename
# Provider classes collected in one list; SciHub is not included.
ALL_PROVIDERS = [
    Springer,
    Cammbridge,
    Ams,
]