From 42343996fa55ba0b19251783a43cf2902825b497 Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Mon, 2 Oct 2017 12:15:07 -0400 Subject: [PATCH 1/7] `Public_file` query param and office365 renderer Adding support for a `public_file` query param so the OSF can request a public renderer. Added office365 which is a public renderer. This uses office online to do .docx file conversions. --- mfr/core/exceptions.py | 19 ++++++++ mfr/core/provider.py | 5 +- mfr/core/utils.py | 13 +++++ mfr/extensions/office365/README.md | 20 ++++++++ mfr/extensions/office365/__init__.py | 1 + mfr/extensions/office365/render.py | 36 ++++++++++++++ mfr/extensions/office365/settings.py | 6 +++ .../office365/templates/viewer.mako | 11 +++++ mfr/providers/osf/provider.py | 41 ++++++++++++---- setup.py | 3 ++ tests/extensions/office365/__init__.py | 0 tests/extensions/office365/test_renderer.py | 47 +++++++++++++++++++ 12 files changed, 192 insertions(+), 10 deletions(-) create mode 100644 mfr/extensions/office365/README.md create mode 100644 mfr/extensions/office365/__init__.py create mode 100644 mfr/extensions/office365/render.py create mode 100644 mfr/extensions/office365/settings.py create mode 100644 mfr/extensions/office365/templates/viewer.mako create mode 100644 tests/extensions/office365/__init__.py create mode 100644 tests/extensions/office365/test_renderer.py diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 8f3573b54..9f7d5c8dc 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -145,6 +145,25 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa 'response': self.response }]) + +class QueryParameterError(ProviderError): + """The MFR related errors raised from a :class:`mfr.core.provider` and relating to query parameters + should inherit from MetadataError + This error is thrown when a query parameter is used missused + """ + + __TYPE = 'query_parameter' + + def __init__(self, message, *args, url: str='', code: int=400, **kwargs): + super().__init__(message, code=code, *args, **kwargs) + self.url = url + self.return_code = code + self.attr_stack.append([self.__TYPE, { + 'url': self.url, + 'returncode': self.return_code, + }]) + + class TooBigToRenderError(ProviderError): """If the user tries to render a file larger than a server specified maximum, throw a TooBigToRenderError. diff --git a/mfr/core/provider.py b/mfr/core/provider.py index dac7c0f62..d173c6c68 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -49,18 +49,21 @@ def download(self): class ProviderMetadata: - def __init__(self, name, ext, content_type, unique_key, download_url, stable_id=None): + def __init__(self, name, ext, content_type, unique_key, + download_url, is_public=False, stable_id=None): self.name = name self.ext = ext self.content_type = content_type self.unique_key = unique_key self.download_url = download_url self.stable_id = stable_id + self.is_public = is_public def serialize(self): return { 'name': self.name, 'ext': self.ext, + 'is_public': self.is_public, 'content_type': self.content_type, 'unique_key': str(self.unique_key), 'download_url': str(self.download_url), diff --git a/mfr/core/utils.py b/mfr/core/utils.py index 52ae1c2b8..97dbb0f4b 100644 --- a/mfr/core/utils.py +++ b/mfr/core/utils.py @@ -78,6 +78,19 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): :rtype: :class:`mfr.core.extension.BaseRenderer` """ normalized_name = (name and name.lower()) or 'none' + if metadata.is_public: + try: + return driver.DriverManager( + namespace='mfr.public_renderers', + name=normalized_name, + invoke_on_load=True, + invoke_args=(metadata, file_path, url, assets_url, export_url), + ).driver + except: + # Check for a public renderer, if one doesn't exist, use a regular one + # Real exceptions handled by main driver.DriverManager + pass + try: return driver.DriverManager( namespace='mfr.renderers', diff --git a/mfr/extensions/office365/README.md b/mfr/extensions/office365/README.md new file mode 100644 index 000000000..a43ca9cc4 --- /dev/null +++ b/mfr/extensions/office365/README.md @@ -0,0 +1,20 @@ + +# Office 365 Renderer + + +This renderer uses Office Online to render .docx files for us. If the Office Online URL ever changes, it will also need to be changed here in settings. + +Currently there is no OSF side component for these changes. Once there is, this specific note can be removed. In the meantime in order to test this renderer, you need to go to your local OSF copy of this file: https://github.com/CenterForOpenScience/osf.io/blob/develop/addons/base/views.py#L728-L736 +and add 'public_file' : 1, to the dict. This will send all files as public files. + +Testing this renderer locally is hard. Since Office Online needs access to the files it will not work with private files or ones hosted locally. To see what the docx files will render like, replace the render function with something that looks like this: + +``` + def render(self): + static_url = 'https://files.osf.io/v1/resources//providers/osfstorage/' + url = settings.OFFICE_BASE_URL + download_url.url + return self.TEMPLATE.render(base=self.assets_url, url=url) + +``` + +The file at `static_url` must be publicly available. diff --git a/mfr/extensions/office365/__init__.py b/mfr/extensions/office365/__init__.py new file mode 100644 index 000000000..08833dba1 --- /dev/null +++ b/mfr/extensions/office365/__init__.py @@ -0,0 +1 @@ +from .render import Office365Renderer # noqa diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py new file mode 100644 index 000000000..2760ce761 --- /dev/null +++ b/mfr/extensions/office365/render.py @@ -0,0 +1,36 @@ +import os +import furl + +from mfr.core import extension +from mako.lookup import TemplateLookup +from mfr.extensions.office365 import settings + + +class Office365Renderer(extension.BaseRenderer): + """A renderer for use with public .docx files. + + Office online can render .docx files to pdf for us. + This renderer will only ever be made if a query param with `public_file=1` is sent. + It then generates and embeds an office online url into an + iframe and returns the template. The file it is trying to render MUST + be available publically online. This renderer will not work if testing locally. + + """ + + TEMPLATE = TemplateLookup( + directories=[ + os.path.join(os.path.dirname(__file__), 'templates') + ]).get_template('viewer.mako') + + def render(self): + download_url = furl.furl(self.metadata.download_url).set(query='') + url = settings.OFFICE_BASE_URL + download_url.url + return self.TEMPLATE.render(base=self.assets_url, url=url) + + @property + def file_required(self): + return False + + @property + def cache_result(self): + return False diff --git a/mfr/extensions/office365/settings.py b/mfr/extensions/office365/settings.py new file mode 100644 index 000000000..c92ba78e4 --- /dev/null +++ b/mfr/extensions/office365/settings.py @@ -0,0 +1,6 @@ +from mfr import settings + + +config = settings.child('OFFICE365_EXTENSION_CONFIG') + +OFFICE_BASE_URL = 'https://view.officeapps.live.com/op/embed.aspx?src=' diff --git a/mfr/extensions/office365/templates/viewer.mako b/mfr/extensions/office365/templates/viewer.mako new file mode 100644 index 000000000..cfc2840dc --- /dev/null +++ b/mfr/extensions/office365/templates/viewer.mako @@ -0,0 +1,11 @@ + + + + + + diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index 8fa52b6a4..de8fa6fd2 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -97,14 +97,16 @@ async def metadata(self): self.metrics.add('metadata.raw', metadata) # e.g., - # metadata = {'data': { - # 'name': 'blah.png', - # 'contentType': 'image/png', - # 'etag': 'ABCD123456...', - # 'extra': { - # ... - # }, - # }} + # metadata = { + # 'data': { + # 'name': 'blah.png', + # 'contentType': 'image/png', + # 'etag': 'ABCD123456...', + # 'extra': { + # ... + # }, + # } + # } name, ext = os.path.splitext(metadata['data']['name']) size = metadata['data']['size'] @@ -128,8 +130,29 @@ async def metadata(self): stable_str = '/{}/{}{}'.format(meta['resource'], meta['provider'], meta['path']) stable_id = hashlib.sha256(stable_str.encode('utf-8')).hexdigest() logger.debug('stable_identifier: str({}) hash({})'.format(stable_str, stable_id)) + is_public = False + + if 'public_file' in cleaned_url.args: + if cleaned_url.args['public_file'] not in ['0', '1']: + raise exceptions.QueryParameterError( + 'The `public_file` query paramter should either `0`, `1`, or unused. Instead ' + 'got {}'.format(cleaned_url.args['public_file']), + url=download_url, + provider=self.NAME, + code=400, + ) - return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url, stable_id) + is_public = cleaned_url.args['public_file'] == '1' + + return provider.ProviderMetadata( + name, + ext, + content_type, + unique_key, + download_url, + stable_id, + is_public=is_public + ) async def download(self): """Download file from WaterButler, returning stream.""" diff --git a/setup.py b/setup.py index ecfe3aebd..08931112a 100755 --- a/setup.py +++ b/setup.py @@ -43,6 +43,9 @@ def parse_requirements(requirements): 'http = mfr.providers.http:HttpProvider', 'osf = mfr.providers.osf:OsfProvider', ], + 'mfr.public_renderers': [ + '.docx = mfr.extensions.office365:Office365Renderer', + ], 'mfr.exporters': [ # google docs '.gdraw = mfr.extensions.image:ImageExporter', diff --git a/tests/extensions/office365/__init__.py b/tests/extensions/office365/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/extensions/office365/test_renderer.py b/tests/extensions/office365/test_renderer.py new file mode 100644 index 000000000..ae485a125 --- /dev/null +++ b/tests/extensions/office365/test_renderer.py @@ -0,0 +1,47 @@ +import furl +import pytest + +from mfr.extensions.office365 import settings +from mfr.core.provider import ProviderMetadata +from mfr.extensions.office365 import Office365Renderer + + +@pytest.fixture +def metadata(): + return ProviderMetadata('test', '.pdf', 'text/plain', '1234', + 'http://wb.osf.io/file/test.pdf?token=1234&public_file=1', + is_public=True) + + +@pytest.fixture +def file_path(): + return '/tmp/test.docx' + + +@pytest.fixture +def url(): + return 'http://osf.io/file/test.pdf' + + +@pytest.fixture +def assets_url(): + return 'http://mfr.osf.io/assets' + + +@pytest.fixture +def export_url(): + return 'http://mfr.osf.io/export?url=' + url() + + +@pytest.fixture +def renderer(metadata, file_path, url, assets_url, export_url): + return Office365Renderer(metadata, file_path, url, assets_url, export_url) + + +class TestOffice365Renderer: + + def test_render_pdf(self, renderer, metadata, assets_url): + download_url = furl.furl(metadata.download_url).set(query='') + body_url = settings.OFFICE_BASE_URL + download_url.url + body = renderer.render() + assert ''.format(body_url) in body From 32bf62ce2d9a9e2beabf3f9d78aac06a9935a726 Mon Sep 17 00:00:00 2001 From: longze chen Date: Wed, 6 Dec 2017 13:50:09 -0500 Subject: [PATCH 2/7] Encode download url, fix tests and style updates. --- mfr/core/exceptions.py | 5 +- mfr/core/provider.py | 16 +++--- mfr/core/utils.py | 8 +-- mfr/extensions/office365/render.py | 26 +++++----- mfr/providers/osf/provider.py | 55 ++++++++++----------- tests/extensions/office365/test_renderer.py | 21 +++++--- tests/server/handlers/test_query_params.py | 2 +- 7 files changed, 72 insertions(+), 61 deletions(-) diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 9f7d5c8dc..317399568 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -147,9 +147,8 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa class QueryParameterError(ProviderError): - """The MFR related errors raised from a :class:`mfr.core.provider` and relating to query parameters - should inherit from MetadataError - This error is thrown when a query parameter is used missused + """The MFR related errors raised from a :class:`mfr.core.provider`and relating to query + parameters. This error is thrown when the query has an invalid value. """ __TYPE = 'query_parameter' diff --git a/mfr/core/provider.py b/mfr/core/provider.py index d173c6c68..66856de01 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -1,11 +1,12 @@ import abc -import markupsafe +from aiohttp import HttpBadRequest import furl +import markupsafe -from mfr.core import exceptions -from mfr.server import settings +from mfr.core import exceptions as core_exceptions from mfr.core.metrics import MetricsRecord +from mfr.server import settings as server_settings class BaseProvider(metaclass=abc.ABCMeta): @@ -17,12 +18,13 @@ class BaseProvider(metaclass=abc.ABCMeta): def __init__(self, request, url, action=None): self.request = request url_netloc = furl.furl(url).netloc - if url_netloc not in settings.ALLOWED_PROVIDER_NETLOCS: - raise exceptions.ProviderError( + if url_netloc not in server_settings.ALLOWED_PROVIDER_NETLOCS: + raise core_exceptions.ProviderError( message="{} is not a permitted provider domain.".format( markupsafe.escape(url_netloc) ), - code=400 + # TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later + code=HttpBadRequest.code ) self.url = url self.action = action @@ -63,9 +65,9 @@ def serialize(self): return { 'name': self.name, 'ext': self.ext, - 'is_public': self.is_public, 'content_type': self.content_type, 'unique_key': str(self.unique_key), 'download_url': str(self.download_url), 'stable_id': None if self.stable_id is None else str(self.stable_id), + 'is_public': self.is_public, } diff --git a/mfr/core/utils.py b/mfr/core/utils.py index 97dbb0f4b..30190cc13 100644 --- a/mfr/core/utils.py +++ b/mfr/core/utils.py @@ -80,6 +80,7 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): normalized_name = (name and name.lower()) or 'none' if metadata.is_public: try: + # Use the public renderer if exist return driver.DriverManager( namespace='mfr.public_renderers', name=normalized_name, @@ -87,18 +88,19 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): invoke_args=(metadata, file_path, url, assets_url, export_url), ).driver except: - # Check for a public renderer, if one doesn't exist, use a regular one - # Real exceptions handled by main driver.DriverManager + # If public render does not exist, use default renderer by MFR + # If public render exists but exceptions occurs, delay the exception handling pass try: + # Use the default MFR handler return driver.DriverManager( namespace='mfr.renderers', name=normalized_name, invoke_on_load=True, invoke_args=(metadata, file_path, url, assets_url, export_url), ).driver - except RuntimeError: + except: raise exceptions.MakeRendererError( namespace='mfr.renderers', name=normalized_name, diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py index 2760ce761..2c03cb3e1 100644 --- a/mfr/extensions/office365/render.py +++ b/mfr/extensions/office365/render.py @@ -1,19 +1,22 @@ import os +from urllib import parse + import furl +from mako.lookup import TemplateLookup from mfr.core import extension -from mako.lookup import TemplateLookup -from mfr.extensions.office365 import settings +from mfr.extensions.office365 import settings as office365_settings class Office365Renderer(extension.BaseRenderer): - """A renderer for use with public .docx files. + """A renderer for .docx files that are publicly available. + + Office online can render `.docx` files to `.pdf` for us. This renderer will only be made + if a query param with `public_file=true` is present. It then generates and embeds an + office online URL into an `iframe` and returns the template. The file it is trying to + render MUST be public. - Office online can render .docx files to pdf for us. - This renderer will only ever be made if a query param with `public_file=1` is sent. - It then generates and embeds an office online url into an - iframe and returns the template. The file it is trying to render MUST - be available publically online. This renderer will not work if testing locally. + Note: this renderer DOES NOT work locally. """ @@ -23,9 +26,10 @@ class Office365Renderer(extension.BaseRenderer): ]).get_template('viewer.mako') def render(self): - download_url = furl.furl(self.metadata.download_url).set(query='') - url = settings.OFFICE_BASE_URL + download_url.url - return self.TEMPLATE.render(base=self.assets_url, url=url) + download_url = furl.furl(self.metadata.download_url).set(query='').url + encoded_download_url = parse.quote(download_url) + office_render_url = office365_settings.OFFICE_BASE_URL + encoded_download_url + return self.TEMPLATE.render(base=self.assets_url, url=office_render_url) @property def file_required(self): diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index de8fa6fd2..c3ea3866d 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -1,9 +1,10 @@ -import os import json import hashlib +from http import HTTPStatus import logging -from urllib.parse import urlparse import mimetypes +import os +from urllib.parse import urlparse import furl import aiohttp @@ -11,12 +12,10 @@ from waterbutler.core import streams -from mfr.core import exceptions -from mfr.core import provider -from mfr.core.utils import sizeof_fmt -from mfr.providers.osf import settings -from mfr.settings import MAX_FILE_SIZE_TO_RENDER -from mfr.core.exceptions import TooBigToRenderError +from mfr import settings as mfr_settings +from mfr.core import exceptions as core_exceptions +from mfr.core import provider, utils +from mfr.providers.osf import settings as provider_settings logger = logging.getLogger(__name__) @@ -79,14 +78,14 @@ async def metadata(self): response_reason = metadata_response.reason response_headers = metadata_response.headers await metadata_response.release() - if response_code != 200: - raise exceptions.MetadataError( + if response_code != HTTPStatus.OK: + raise core_exceptions.MetadataError( 'Failed to fetch file metadata from WaterButler. Received response: ', 'code {} {}'.format(str(response_code), str(response_reason)), metadata_url=download_url, response=response_reason, provider=self.NAME, - code=400 + code=HTTPStatus.BAD_REQUEST ) try: @@ -111,12 +110,12 @@ async def metadata(self): name, ext = os.path.splitext(metadata['data']['name']) size = metadata['data']['size'] - max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext) + max_file_size = mfr_settings.MAX_FILE_SIZE_TO_RENDER.get(ext) if max_file_size and size and int(size) > max_file_size: - raise TooBigToRenderError( + raise core_exceptions.TooBigToRenderError( "This file with extension '{ext}' exceeds the size limit of {max_size} and will not " "be rendered. To view this file download it and view it " - "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)), + "offline.".format(ext=ext, max_size=utils.sizeof_fmt(max_file_size)), requested_size=int(size), maximum_size=max_file_size, ) @@ -131,18 +130,16 @@ async def metadata(self): stable_id = hashlib.sha256(stable_str.encode('utf-8')).hexdigest() logger.debug('stable_identifier: str({}) hash({})'.format(stable_str, stable_id)) is_public = False - - if 'public_file' in cleaned_url.args: - if cleaned_url.args['public_file'] not in ['0', '1']: - raise exceptions.QueryParameterError( - 'The `public_file` query paramter should either `0`, `1`, or unused. Instead ' - 'got {}'.format(cleaned_url.args['public_file']), + public_file = cleaned_url.args.get('public_file', None) + if public_file: + if public_file not in ['True', 'False']: + raise core_exceptions.QueryParameterError( + 'Invalid value for query parameter `public_file`: {}'.format(cleaned_url.args['public_file']), url=download_url, provider=self.NAME, - code=400, + code=HTTPStatus.BAD_REQUEST, ) - - is_public = cleaned_url.args['public_file'] == '1' + is_public = public_file == 'True' return provider.ProviderMetadata( name, @@ -157,13 +154,13 @@ async def metadata(self): async def download(self): """Download file from WaterButler, returning stream.""" download_url = await self._fetch_download_url() - headers = {settings.MFR_IDENTIFYING_HEADER: '1'} + headers = {provider_settings.MFR_IDENTIFYING_HEADER: '1'} response = await self._make_request('GET', download_url, allow_redirects=False, headers=headers) - if response.status >= 400: + if response.status >= HTTPStatus.BAD_REQUEST: resp_text = await response.text() logger.error('Unable to download file: ({}) {}'.format(response.status, resp_text)) - raise exceptions.DownloadError( + raise core_exceptions.DownloadError( 'Unable to download the requested file, please try again later.', download_url=download_url, response=resp_text, @@ -171,7 +168,7 @@ async def download(self): ) self.metrics.add('download.saw_redirect', False) - if response.status in (302, 301): + if response.status in (HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND): await response.release() response = await aiohttp.request('GET', response.headers['location']) self.metrics.add('download.saw_redirect', True) @@ -204,8 +201,8 @@ async def _fetch_download_url(self): await request.release() logger.debug('osf-download-resolver: request.status::{}'.format(request.status)) - if request.status != 302: - raise exceptions.MetadataError( + if request.status != HTTPStatus.FOUND: + raise core_exceptions.MetadataError( request.reason, metadata_url=self.url, provider=self.NAME, diff --git a/tests/extensions/office365/test_renderer.py b/tests/extensions/office365/test_renderer.py index ae485a125..5d90d3990 100644 --- a/tests/extensions/office365/test_renderer.py +++ b/tests/extensions/office365/test_renderer.py @@ -1,16 +1,23 @@ +from urllib import parse + import furl import pytest -from mfr.extensions.office365 import settings from mfr.core.provider import ProviderMetadata from mfr.extensions.office365 import Office365Renderer +from mfr.extensions.office365 import settings as office365_settings @pytest.fixture def metadata(): - return ProviderMetadata('test', '.pdf', 'text/plain', '1234', + return ProviderMetadata( + 'test', + '.pdf', + 'text/plain', + '1234', 'http://wb.osf.io/file/test.pdf?token=1234&public_file=1', - is_public=True) + is_public=True + ) @pytest.fixture @@ -20,7 +27,7 @@ def file_path(): @pytest.fixture def url(): - return 'http://osf.io/file/test.pdf' + return parse.quote('http://osf.io/file/test.pdf') @pytest.fixture @@ -40,8 +47,8 @@ def renderer(metadata, file_path, url, assets_url, export_url): class TestOffice365Renderer: - def test_render_pdf(self, renderer, metadata, assets_url): + def test_render_pdf(self, renderer, metadata): download_url = furl.furl(metadata.download_url).set(query='') - body_url = settings.OFFICE_BASE_URL + download_url.url + office_render_url = office365_settings.OFFICE_BASE_URL + parse.quote(download_url.url) body = renderer.render() - assert ''.format(body_url) in body + assert ''.format(office_render_url) in body diff --git a/tests/server/handlers/test_query_params.py b/tests/server/handlers/test_query_params.py index b707035dc..001c42b27 100644 --- a/tests/server/handlers/test_query_params.py +++ b/tests/server/handlers/test_query_params.py @@ -7,7 +7,7 @@ from tests import utils -class TestRenderHandler(utils.HandlerTestCase): +class TestQueryParamsHandler(utils.HandlerTestCase): @testing.gen_test def test_format_url(self): From a1f889fbf1975df592f1ed29b65481bd28cd4e9e Mon Sep 17 00:00:00 2001 From: Josh Bird Date: Fri, 3 Aug 2018 15:48:21 -0400 Subject: [PATCH 3/7] Tidy up imports, small formatting Cleans up some imports that don't need to be qualified, or explicitly imports whatever resources the files need. --- mfr/core/exceptions.py | 11 +++--- mfr/core/provider.py | 30 ++++++++++------- mfr/extensions/office365/render.py | 18 +++++----- mfr/providers/osf/provider.py | 54 ++++++++++++++++++------------ 4 files changed, 66 insertions(+), 47 deletions(-) diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 317399568..2b62d5090 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -157,10 +157,13 @@ def __init__(self, message, *args, url: str='', code: int=400, **kwargs): super().__init__(message, code=code, *args, **kwargs) self.url = url self.return_code = code - self.attr_stack.append([self.__TYPE, { - 'url': self.url, - 'returncode': self.return_code, - }]) + self.attr_stack.append([ + self.__TYPE, + { + 'url': self.url, + 'returncode': self.return_code, + } + ]) class TooBigToRenderError(ProviderError): diff --git a/mfr/core/provider.py b/mfr/core/provider.py index 66856de01..8aab2d01e 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -1,15 +1,19 @@ -import abc +from abc import ( + ABCMeta, + abstractmethod, + abstractproperty +) from aiohttp import HttpBadRequest -import furl -import markupsafe +from furl import furl +from markupsafe import escape -from mfr.core import exceptions as core_exceptions +from mfr.core.exceptions import ProviderError from mfr.core.metrics import MetricsRecord -from mfr.server import settings as server_settings +from mfr.server.settings import ALLOWED_PROVIDER_NETLOCS -class BaseProvider(metaclass=abc.ABCMeta): +class BaseProvider(metaclass=ABCMeta): """Base class for MFR Providers. Requires ``download`` and ``metadata`` methods. Validates that the given file url is hosted at a domain listed in `mfr.server.settings.ALLOWED_PROVIDER_DOMAINS`. @@ -17,11 +21,11 @@ class BaseProvider(metaclass=abc.ABCMeta): def __init__(self, request, url, action=None): self.request = request - url_netloc = furl.furl(url).netloc - if url_netloc not in server_settings.ALLOWED_PROVIDER_NETLOCS: - raise core_exceptions.ProviderError( + netloc = furl(url).netloc + if netloc not in ALLOWED_PROVIDER_NETLOCS: + raise ProviderError( message="{} is not a permitted provider domain.".format( - markupsafe.escape(url_netloc) + escape(netloc) ), # TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later code=HttpBadRequest.code @@ -36,15 +40,15 @@ def __init__(self, request, url, action=None): 'url': str(self.url), }) - @abc.abstractproperty + @abstractproperty def NAME(self): raise NotImplementedError - @abc.abstractmethod + @abstractmethod def metadata(self): pass - @abc.abstractmethod + @abstractmethod def download(self): pass diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py index 2c03cb3e1..0f8d8e75f 100644 --- a/mfr/extensions/office365/render.py +++ b/mfr/extensions/office365/render.py @@ -1,14 +1,14 @@ import os from urllib import parse -import furl +from furl import furl from mako.lookup import TemplateLookup -from mfr.core import extension -from mfr.extensions.office365 import settings as office365_settings +from mfr.core.extension import BaseRenderer +from mfr.extensions.office365.settings import OFFICE_BASE_URL -class Office365Renderer(extension.BaseRenderer): +class Office365Renderer(BaseRenderer): """A renderer for .docx files that are publicly available. Office online can render `.docx` files to `.pdf` for us. This renderer will only be made @@ -17,7 +17,6 @@ class Office365Renderer(extension.BaseRenderer): render MUST be public. Note: this renderer DOES NOT work locally. - """ TEMPLATE = TemplateLookup( @@ -26,10 +25,11 @@ class Office365Renderer(extension.BaseRenderer): ]).get_template('viewer.mako') def render(self): - download_url = furl.furl(self.metadata.download_url).set(query='').url - encoded_download_url = parse.quote(download_url) - office_render_url = office365_settings.OFFICE_BASE_URL + encoded_download_url - return self.TEMPLATE.render(base=self.assets_url, url=office_render_url) + download_url = furl(self.metadata.download_url).set(query='').url + return self.TEMPLATE.render( + base=self.assets_url, + url=OFFICE_BASE_URL + parse.quote(download_url) + ) @property def file_required(self): diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index c3ea3866d..78250cbb4 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -10,23 +10,35 @@ import aiohttp from aiohttp.errors import ContentEncodingError -from waterbutler.core import streams - -from mfr import settings as mfr_settings -from mfr.core import exceptions as core_exceptions -from mfr.core import provider, utils -from mfr.providers.osf import settings as provider_settings +from waterbutler.core.streams import ResponseStreamReader + +from mfr.core.exceptions import ( + DownloadError, + MetadataError, + TooBigToRenderError, + QueryParameterError +) +from mfr.core.provider import ( + BaseProvider, + ProviderMetadata +) +from mfr.core.utils import sizeof_fmt +from mfr.providers.osf.settings import ( + MFR_ACTION_HEADER, + MFR_IDENTIFYING_HEADER +) +from mfr.settings import MAX_FILE_SIZE_TO_RENDER logger = logging.getLogger(__name__) -class OsfProvider(provider.BaseProvider): +class OsfProvider(BaseProvider): """Open Science Framework (https://osf.io) -aware provider. Knows the OSF ecosystem and can request specific metadata for the file referenced by the URL. Can correctly propagate OSF authorization to verify ownership and permisssions of file. """ - UNNEEDED_URL_PARAMS = ('_', 'token', 'action', 'mode', 'displayName') + UNNEEDED_URL_PARAMS = {'_', 'token', 'action', 'mode', 'displayName'} NAME = 'osf' def __init__(self, request, url, action=None): @@ -72,14 +84,14 @@ async def metadata(self): metadata_response = await self._make_request( 'HEAD', download_url, - headers={settings.MFR_ACTION_HEADER: self.action or ''} + headers={MFR_ACTION_HEADER: self.action or ''} ) response_code = metadata_response.status response_reason = metadata_response.reason response_headers = metadata_response.headers await metadata_response.release() if response_code != HTTPStatus.OK: - raise core_exceptions.MetadataError( + raise MetadataError( 'Failed to fetch file metadata from WaterButler. Received response: ', 'code {} {}'.format(str(response_code), str(response_reason)), metadata_url=download_url, @@ -110,12 +122,12 @@ async def metadata(self): name, ext = os.path.splitext(metadata['data']['name']) size = metadata['data']['size'] - max_file_size = mfr_settings.MAX_FILE_SIZE_TO_RENDER.get(ext) + max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext) if max_file_size and size and int(size) > max_file_size: - raise core_exceptions.TooBigToRenderError( + raise TooBigToRenderError( "This file with extension '{ext}' exceeds the size limit of {max_size} and will not " "be rendered. To view this file download it and view it " - "offline.".format(ext=ext, max_size=utils.sizeof_fmt(max_file_size)), + "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)), requested_size=int(size), maximum_size=max_file_size, ) @@ -132,8 +144,8 @@ async def metadata(self): is_public = False public_file = cleaned_url.args.get('public_file', None) if public_file: - if public_file not in ['True', 'False']: - raise core_exceptions.QueryParameterError( + if public_file not in {'True', 'False'}: + raise QueryParameterError( 'Invalid value for query parameter `public_file`: {}'.format(cleaned_url.args['public_file']), url=download_url, provider=self.NAME, @@ -141,7 +153,7 @@ async def metadata(self): ) is_public = public_file == 'True' - return provider.ProviderMetadata( + return ProviderMetadata( name, ext, content_type, @@ -154,13 +166,13 @@ async def metadata(self): async def download(self): """Download file from WaterButler, returning stream.""" download_url = await self._fetch_download_url() - headers = {provider_settings.MFR_IDENTIFYING_HEADER: '1'} + headers = {MFR_IDENTIFYING_HEADER: '1'} response = await self._make_request('GET', download_url, allow_redirects=False, headers=headers) if response.status >= HTTPStatus.BAD_REQUEST: resp_text = await response.text() logger.error('Unable to download file: ({}) {}'.format(response.status, resp_text)) - raise core_exceptions.DownloadError( + raise DownloadError( 'Unable to download the requested file, please try again later.', download_url=download_url, response=resp_text, @@ -168,12 +180,12 @@ async def download(self): ) self.metrics.add('download.saw_redirect', False) - if response.status in (HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND): + if response.status in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}: await response.release() response = await aiohttp.request('GET', response.headers['location']) self.metrics.add('download.saw_redirect', True) - return streams.ResponseStreamReader(response) + return ResponseStreamReader(response) async def _fetch_download_url(self): """Provider needs a WaterButler URL to download and get metadata. If ``url`` is already @@ -202,7 +214,7 @@ async def _fetch_download_url(self): logger.debug('osf-download-resolver: request.status::{}'.format(request.status)) if request.status != HTTPStatus.FOUND: - raise core_exceptions.MetadataError( + raise MetadataError( request.reason, metadata_url=self.url, provider=self.NAME, From 13ee0a99b1e06e7ba5f91f1c16957dcaafdd7614 Mon Sep 17 00:00:00 2001 From: Josh Bird Date: Fri, 3 Aug 2018 15:54:09 -0400 Subject: [PATCH 4/7] Don't import 'escape' directly `markupsafe.escape(url)` is a little more informative to read than just `escape(url)`, so this reverts the change to use the former. --- mfr/core/provider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mfr/core/provider.py b/mfr/core/provider.py index 8aab2d01e..ca1eb4227 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -6,7 +6,7 @@ from aiohttp import HttpBadRequest from furl import furl -from markupsafe import escape +import markupsafe from mfr.core.exceptions import ProviderError from mfr.core.metrics import MetricsRecord @@ -25,7 +25,7 @@ def __init__(self, request, url, action=None): if netloc not in ALLOWED_PROVIDER_NETLOCS: raise ProviderError( message="{} is not a permitted provider domain.".format( - escape(netloc) + markupsafe.escape(netloc) ), # TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later code=HttpBadRequest.code From b5341f23d523ae7f4e24c681d4100791dc8699fa Mon Sep 17 00:00:00 2001 From: Josh Bird Date: Fri, 3 Aug 2018 16:08:48 -0400 Subject: [PATCH 5/7] Better quoting for long strings Long strings are better quotes with triple quotes so they're more readable, also has the nice effect of lettin strings that have other quotation marks in them not need to be escaped. --- mfr/providers/osf/provider.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index 78250cbb4..1b1e49cc5 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -92,8 +92,11 @@ async def metadata(self): await metadata_response.release() if response_code != HTTPStatus.OK: raise MetadataError( - 'Failed to fetch file metadata from WaterButler. Received response: ', - 'code {} {}'.format(str(response_code), str(response_reason)), + '''Failed to fetch file metadata from WaterButler. + Received response: code {} {}'''.format( + str(response_code), + str(response_reason) + ), metadata_url=download_url, response=response_reason, provider=self.NAME, @@ -125,9 +128,10 @@ async def metadata(self): max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext) if max_file_size and size and int(size) > max_file_size: raise TooBigToRenderError( - "This file with extension '{ext}' exceeds the size limit of {max_size} and will not " - "be rendered. To view this file download it and view it " - "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)), + '''This file with extension '{ext}' exceeds the size limit of {max_size} and will + not be rendered. To view this file download it and view it offline.'''.format( + ext=ext, max_size=sizeof_fmt(max_file_size) + ), requested_size=int(size), maximum_size=max_file_size, ) From 7d3030f7565a0357c989cf44f5f6b9aacb5a86a0 Mon Sep 17 00:00:00 2001 From: Josh Bird Date: Fri, 3 Aug 2018 16:13:00 -0400 Subject: [PATCH 6/7] attr_stack should be a list of tuples This changes th QueryParameterError to append a tuple to its attr stack rather than a list. --- mfr/core/exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 2b62d5090..fb0c86835 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -157,13 +157,13 @@ def __init__(self, message, *args, url: str='', code: int=400, **kwargs): super().__init__(message, code=code, *args, **kwargs) self.url = url self.return_code = code - self.attr_stack.append([ + self.attr_stack.append(( self.__TYPE, { 'url': self.url, 'returncode': self.return_code, } - ]) + )) class TooBigToRenderError(ProviderError): From edc9c184c1fc2db2adfaf186d4915589ecfc5211 Mon Sep 17 00:00:00 2001 From: Josh Bird Date: Mon, 6 Aug 2018 13:53:18 -0400 Subject: [PATCH 7/7] Clarify why wont work locally Added a clarifiaction on why the renderer won't work locally. The rendering service needs to access the file thatis to be rendered, and the url waterbutler gives the renderer is going to be inaccessible to the outside world, unless the developer configures waterbutler to use an externally accessible hostname/ip. --- mfr/extensions/office365/render.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py index 0f8d8e75f..7b2457369 100644 --- a/mfr/extensions/office365/render.py +++ b/mfr/extensions/office365/render.py @@ -16,7 +16,11 @@ class Office365Renderer(BaseRenderer): office online URL into an `iframe` and returns the template. The file it is trying to render MUST be public. - Note: this renderer DOES NOT work locally. + Note: The url for the file to convert must be available publicly on the + internet in order for the renderer to access it. This means files stored on + OSF storage locally will not render unless the local server is listening on + external connections and waterbutler is providing urls that are externally + accessible. """ TEMPLATE = TemplateLookup(