From 1b31cc6569e9f1e1a13b406007d6b887e7cb1dce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 31 May 2019 10:55:29 +0200 Subject: [PATCH] Implement a response size limit option --- splash/defaults.py | 3 +++ splash/network_manager.py | 57 +++++++++++++++++++++++++++++++++++++++ splash/render_options.py | 20 +++++++++++--- splash/resources.py | 37 +++++++++++++++++-------- splash/server.py | 18 ++++++++++--- 5 files changed, 117 insertions(+), 18 deletions(-) diff --git a/splash/defaults.py b/splash/defaults.py index 05bc15ac4..9a51006f4 100644 --- a/splash/defaults.py +++ b/splash/defaults.py @@ -6,6 +6,9 @@ MAX_TIMEOUT = 90.0 +RESPONSE_SIZE_LIMIT = None +MAX_RESPONSE_SIZE_LIMIT = None + # Default size of browser window. As there're no decorations, this affects # both "window.inner*" and "window.outer*" values. VIEWPORT_SIZE = '1024x768' diff --git a/splash/network_manager.py b/splash/network_manager.py index 29108a505..310af4608 100644 --- a/splash/network_manager.py +++ b/splash/network_manager.py @@ -26,10 +26,29 @@ ) from splash.response_middleware import ContentTypeMiddleware from splash import defaults +from splash.qtutils import qt_header_items from splash.utils import to_bytes from splash.cookies import SplashCookieJar +def _get_content_length(reply, log): + for name, value in qt_header_items(reply): + if bytes(name).lower() == b'content-length': + value = bytes(value) + try: + value = value.decode('latin1') + except UnicodeDecodeError: + log("Received a non-ASCII Content-Length header for {{url}}: " + "{}".format(value.hex()), reply, min_level=3) + else: + try: + return int(value) + except ValueError: + log("Received a non-integer Content-Length header for " + "{{url}}: {}".format(value), reply, min_level=3) + break + + class NetworkManagerFactory(object): def __init__(self, filters_path=None, verbosity=None, allowed_schemes=None, disable_browser_caches=None): verbosity = defaults.VERBOSITY if verbosity is None else verbosity @@ -86,6 +105,7 @@ class ProxiedQNetworkAccessManager(QNetworkAccessManager): * Tracks information about requests/responses and stores it in HAR format, including request and response content. * Allows to set per-request timeouts. + * Handles per-request response size limits. """ _REQUEST_ID = QNetworkRequest.User + 1 _SHOULD_TRACK = QNetworkRequest.User + 2 @@ -403,6 +423,21 @@ def _on_reply_headers(self): """ reply = self.sender() request = reply.request() + + content_length = _get_content_length(reply, self.log) + if content_length is not None: + render_options = self._get_render_options(request) + max_size = render_options.get_response_size_limit() + if max_size is not None and content_length > max_size: + self.log( + "The Content-Length header ({}) exceeds the maximum " + "response size ({}), aborting: {{url}}".format( + content_length, max_size), + reply, min_level=1) + self.log(render_options, reply, min_level=1, format_msg=False) + reply.abort() + return + self._handle_reply_cookies(reply) self._run_webpage_callbacks(request, "on_response_headers", reply) @@ -413,6 +448,28 @@ def _on_reply_headers(self): self.log("Headers received for {url}", reply, min_level=3) def _on_reply_download_progress(self, received, total): + reply = self.sender() + request = reply.request() + + def _abort_due_to_size(size, description): + self.log( + "{} ({}) exceeds the maximum response size ({}), aborting: " + "{{url}}".format(description, size, max_size), + reply, min_level=1) + self.log(render_options, reply, min_level=1, format_msg=False) + reply.abort() + return + + render_options = self._get_render_options(request) + max_size = render_options.get_response_size_limit() + if max_size is not None: + if total > max_size: + return _abort_due_to_size(total, "The expected response size") + if received > max_size: + return _abort_due_to_size(received, + "The size of the response content " + "downloaded so far") + har = self._get_har() if har is not None: req_id = self._get_request_id() diff --git a/splash/render_options.py b/splash/render_options.py index f85e30b47..a3c763ec6 100644 --- a/splash/render_options.py +++ b/splash/render_options.py @@ -14,8 +14,9 @@ class RenderOptions(object): _REQUIRED = object() - def __init__(self, data, max_timeout): + def __init__(self, data, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): self.data = data + self.max_response_size_limit = max_response_size_limit self.max_timeout = max_timeout @classmethod @@ -29,7 +30,7 @@ def raise_error(cls, argument, description, type='bad_argument', **kwargs): raise BadOption(params) @classmethod - def fromrequest(cls, request, max_timeout): + def fromrequest(cls, request, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): """ Initialize options from a Twisted Request. """ @@ -60,7 +61,7 @@ def fromrequest(cls, request, max_timeout): request.content.seek(0) data['uid'] = id(request) - return cls(data, max_timeout) + return cls(data, max_timeout, max_response_size_limit=max_response_size_limit) def get_expired_args(self, cache): """ @@ -136,6 +137,18 @@ def get_resource_timeout(self): def get_response_body(self): return self._get_bool("response_body", defaults.RESPONSE_BODY_ENABLED) + def get_response_size_limit(self): + if self.max_response_size_limit is not None: + default = self.max_response_size_limit + else: + default = defaults.RESPONSE_SIZE_LIMIT + value = self.get("response_size_limit", default, type=int) + if value is not None: + value = max(0, value) + if self.max_response_size_limit is not None: + value = min(self.max_response_size_limit, value) + return value + def get_request_body(self): return self._get_bool("request_body", defaults.REQUEST_BODY_ENABLED) @@ -360,6 +373,7 @@ def get_common_params(self, js_profiles_path): 'baseurl': self.get_baseurl(), 'wait': wait, 'resource_timeout': self.get_resource_timeout(), + 'response_size_limit': self.get_response_size_limit(), 'viewport': self.get_viewport(wait), 'render_all': self.get_render_all(wait), 'images': self.get_images(), diff --git a/splash/resources.py b/splash/resources.py index 2c205ef5b..d57bd79e6 100644 --- a/splash/resources.py +++ b/splash/resources.py @@ -17,6 +17,7 @@ import splash from splash.argument_cache import ArgumentCache +from splash import defaults from splash.qtrender import ( HtmlRender, PngRender, JsonRender, HarRender, JpegRender ) @@ -85,17 +86,18 @@ class BaseRenderResource(_ValidatingResource): isLeaf = True content_type = "text/html; charset=utf-8" - def __init__(self, pool, max_timeout, argument_cache): + def __init__(self, pool, max_timeout, argument_cache, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): Resource.__init__(self) self.pool = pool self.js_profiles_path = self.pool.js_profiles_path self.max_timeout = max_timeout self.argument_cache = argument_cache + self.max_response_size_limit = max_response_size_limit def render_GET(self, request): #log.msg("%s %s %s %s" % (id(request), request.method, request.path, request.args)) request.starttime = time.time() - render_options = RenderOptions.fromrequest(request, self.max_timeout) + render_options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit) # process argument cache original_options = render_options.data.copy() @@ -281,8 +283,9 @@ def __init__(self, pool, sandboxed, argument_cache, strict, implicit_main, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): - BaseRenderResource.__init__(self, pool, max_timeout, argument_cache) + BaseRenderResource.__init__(self, pool, max_timeout, argument_cache, max_response_size_limit=max_response_size_limit) self.sandboxed = sandboxed self.lua_package_path = lua_package_path self.lua_sandbox_allowed_modules = lua_sandbox_allowed_modules @@ -434,20 +437,22 @@ class DemoUI(_ValidatingResource): PATH = b'info' - def __init__(self, pool, lua_enabled, max_timeout): + def __init__(self, pool, lua_enabled, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): Resource.__init__(self) self.pool = pool self.lua_enabled = lua_enabled self.max_timeout = max_timeout + self.max_response_size_limit = max_response_size_limit def _validate_params(self, request): - options = RenderOptions.fromrequest(request, self.max_timeout) + options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit) options.get_filters(self.pool) # check params = options.get_common_params(self.pool.js_profiles_path) params.update({ 'save_args': options.get_save_args(), 'load_args': options.get_load_args(), 'timeout': options.get_timeout(), + 'response_size_limit': options.get_response_size_limit(), 'request_body': options.get_request_body(), 'response_body': options.get_response_body(), 'har': 1, @@ -471,6 +476,7 @@ def render_GET(self, request): url = 'http://' + url params['url'] = url timeout = params['timeout'] + response_size_limit = params['response_size_limit'] params = {k: v for k, v in params.items() if v is not None} # disable "phases" HAR Viewer feature @@ -514,6 +520,7 @@ def render_GET(self, request): +
@@ -563,6 +570,7 @@ def render_GET(self, request): "lua_enabled": self.lua_enabled, }), timeout=timeout, + response_size_limit=response_size_limit, url=url, theme=BOOTSTRAP_THEME, cm_resources=CODEMIRROR_RESOURCES if self.lua_enabled else "", @@ -576,6 +584,7 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, max_timeout, argument_cache_max_entries, strict_lua_runner, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): Resource.__init__(self) self.argument_cache = ArgumentCache(argument_cache_max_entries) @@ -583,11 +592,12 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, self.lua_enabled = lua_enabled _args = pool, max_timeout, self.argument_cache - self.putChild(b"render.html", RenderHtmlResource(*_args)) - self.putChild(b"render.png", RenderPngResource(*_args)) - self.putChild(b"render.jpeg", RenderJpegResource(*_args)) - self.putChild(b"render.json", RenderJsonResource(*_args)) - self.putChild(b"render.har", RenderHarResource(*_args)) + _kwargs = {'max_response_size_limit': max_response_size_limit} + self.putChild(b"render.html", RenderHtmlResource(*_args, **_kwargs)) + self.putChild(b"render.png", RenderPngResource(*_args, **_kwargs)) + self.putChild(b"render.jpeg", RenderJpegResource(*_args, **_kwargs)) + self.putChild(b"render.json", RenderJsonResource(*_args, **_kwargs)) + self.putChild(b"render.har", RenderHarResource(*_args, **_kwargs)) self.putChild(b"_debug", DebugResource(pool, self.argument_cache)) self.putChild(b"_gc", ClearCachesResource(self.argument_cache)) @@ -605,6 +615,7 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, max_timeout=max_timeout, argument_cache=self.argument_cache, strict=strict_lua_runner, + max_response_size_limit=max_response_size_limit, ) self.putChild(b"execute", ExecuteLuaScriptResource( implicit_main=False, **lua_kwargs)) @@ -626,9 +637,11 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled, self.putChild(DemoUI.PATH, DemoUI( pool=pool, lua_enabled=self.lua_enabled, - max_timeout=max_timeout + max_timeout=max_timeout, + max_response_size_limit=max_response_size_limit, )) self.max_timeout = max_timeout + self.max_response_size_limit = max_response_size_limit def getChild(self, name, request): if name == b"" and self.ui_enabled: @@ -720,6 +733,7 @@ def render_GET(self, request): +
@@ -754,5 +768,6 @@ def render_GET(self, request): }), cm_resources=CODEMIRROR_RESOURCES, timeout=self.max_timeout, + response_size_limit=self.max_response_size_limit, ) return result.encode('utf8') diff --git a/splash/server.py b/splash/server.py index 1aa288a58..1f926e318 100644 --- a/splash/server.py +++ b/splash/server.py @@ -78,6 +78,9 @@ def parse_opts(jupyter=False, argv=sys.argv): help="number of render slots (default: %default)") op.add_option("--max-timeout", type="float", default=defaults.MAX_TIMEOUT, help="maximum allowed value for timeout (default: %default)") + op.add_option("--max-response-size-limit", type="int", + default=defaults.MAX_RESPONSE_SIZE_LIMIT, + help="maximum allowed value for response size limit (default: %default)") op.add_option("--disable-ui", action="store_true", default=False, help="disable web UI") op.add_option("--disable-lua", action="store_true", default=False, @@ -94,6 +97,7 @@ def parse_opts(jupyter=False, argv=sys.argv): opts.port = None opts.slots = None opts.max_timeout = None + opts.max_response_size_limit = None opts.argument_cache_max_entries = None return opts, args @@ -170,7 +174,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout, strict_lua_runner=False, argument_cache_max_entries=None, disable_browser_caches=False, - verbosity=None): + verbosity=None, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): from twisted.internet import reactor from twisted.web.server import Site from splash.resources import Root @@ -181,8 +186,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout, verbosity = defaults.VERBOSITY if verbosity is None else verbosity slots = defaults.SLOTS if slots is None else slots - log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}".format( - verbosity, slots, argument_cache_max_entries, max_timeout + log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}, max-response-size-limit={}".format( + verbosity, slots, argument_cache_max_entries, max_timeout, max_response_size_limit )) pool = RenderPool( @@ -215,6 +220,7 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout, max_timeout=max_timeout, argument_cache_max_entries=argument_cache_max_entries, strict_lua_runner=strict_lua_runner, + max_response_size_limit=max_response_size_limit, ) factory = Site(root) reactor.listenTCP(portnum, factory, interface=ip) @@ -264,6 +270,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None, verbosity=None, server_factory=splash_server, disable_browser_caches=False, + max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): from splash import network_manager network_manager_factory = network_manager.NetworkManagerFactory( @@ -271,6 +278,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None, verbosity=verbosity, allowed_schemes=allowed_schemes, disable_browser_caches=disable_browser_caches, + max_response_size_limit=max_response_size_limit, ) splash_proxy_factory_cls = _default_proxy_factory(proxy_profiles_path) js_profiles_path = _check_js_profiles_path(js_profiles_path) @@ -293,6 +301,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None, verbosity=verbosity, max_timeout=max_timeout, argument_cache_max_entries=argument_cache_max_entries, + max_response_size_limit=max_response_size_limit, ) @@ -391,7 +400,8 @@ def main(jupyter=False, argv=sys.argv, server_factory=splash_server): max_timeout=opts.max_timeout, argument_cache_max_entries=opts.argument_cache_max_entries, server_factory=server_factory, - disable_browser_caches=opts.disable_browser_caches + disable_browser_caches=opts.disable_browser_caches, + max_response_size_limit=opts.max_response_size_limit, ) signal.signal(signal.SIGUSR1, lambda s, f: traceback.print_stack(f))