Skip to content

Commit

Permalink
Implement a response size limit option
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Jun 7, 2019
1 parent a729023 commit a836917
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 18 deletions.
3 changes: 3 additions & 0 deletions splash/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

MAX_TIMEOUT = 90.0

RESPONSE_SIZE_LIMIT = None
MAX_RESPONSE_SIZE_LIMIT = None

# Default size of browser window. As there are no decorations, this affects
# both "window.inner*" and "window.outer*" values.
VIEWPORT_SIZE = '1024x768'
Expand Down
95 changes: 95 additions & 0 deletions splash/network_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,34 @@
)
from splash.response_middleware import ContentTypeMiddleware
from splash import defaults
from splash.qtutils import qt_header_items
from splash.utils import to_bytes
from splash.cookies import SplashCookieJar


class _InvalidContentLength(ValueError):
    """Error for a Content-Length header value that cannot be used as a size.

    The offending value is embedded in the error message. ``bytes`` values
    are rendered as a ``0x``-prefixed hexadecimal string; any other value is
    rendered the way ``str.format`` renders it.
    """

    def __init__(self, value):
        shown = '0x' + value.hex() if isinstance(value, bytes) else value
        super().__init__(
            'Invalid Content-Length header value: {}'.format(shown))


def _get_content_length(reply):
    """Return the size announced by the Content-Length header of ``reply``.

    Header names are compared case-insensitively and the first
    Content-Length header yielded by ``qt_header_items(reply)`` decides the
    result.

    :param reply: object accepted by ``qt_header_items`` (presumably a
        ``QNetworkReply`` -- confirm against callers).
    :return: the announced size as a non-negative ``int``, or ``None`` when
        the reply has no Content-Length header.
    :raises _InvalidContentLength: if the header value is not made up of
        ASCII digits only (optionally surrounded by ASCII whitespace).
    """
    for name, value in qt_header_items(reply):
        if bytes(name).lower() != b'content-length':
            continue
        # A value such as "42, 42" is handled by looking at the first
        # comma-separated entry only.
        raw = bytes(value).split(b',', 1)[0].strip()
        # Validate the digits explicitly instead of relying on int(), which
        # also accepts signs, underscores ("1_000") and non-ASCII
        # whitespace. Negative values are rejected here as well because of
        # the leading "-".
        if not raw.isdigit():
            # latin1 maps every byte to a character, so decoding cannot
            # fail and the message always shows readable text.
            raise _InvalidContentLength(raw.decode('latin1'))
        return int(raw)
    return None


class NetworkManagerFactory(object):
def __init__(self, filters_path=None, verbosity=None, allowed_schemes=None, disable_browser_caches=None):
verbosity = defaults.VERBOSITY if verbosity is None else verbosity
Expand Down Expand Up @@ -86,6 +110,7 @@ class ProxiedQNetworkAccessManager(QNetworkAccessManager):
* Tracks information about requests/responses and stores it in HAR format,
including request and response content.
* Allows to set per-request timeouts.
* Handles per-request response size limits.
"""
_REQUEST_ID = QNetworkRequest.User + 1
_SHOULD_TRACK = QNetworkRequest.User + 2
Expand Down Expand Up @@ -398,11 +423,71 @@ def _on_reply_finished(self):
content)
self.log("Finished downloading {url}", reply)

def _aborted_due_to_size(self, sizes_and_sources):
    """Abort the reply that sent the current signal if it is too large.

    The effective limit is the ``response_size_limit`` rendering option of
    the request when it is an integer that is not negative and does not
    exceed ``render_options.max_response_size_limit``. In every other case
    (option missing or rejected) the limit falls back to
    ``render_options.max_response_size_limit`` and, when that is ``None``,
    to ``defaults.RESPONSE_SIZE_LIMIT``. If no limit results, nothing is
    aborted.

    :param sizes_and_sources: iterable of ``(size, source)`` pairs, where
        ``size`` is a number or ``None`` (skipped) and ``source`` is the
        description of that size used in the log message.
    :return: ``True`` if ``reply.abort()`` was called, ``False`` otherwise.
    """
    reply = self.sender()
    render_options = self._get_render_options(reply.request())
    if render_options is None:
        return False

    option = "response_size_limit"
    max_size = render_options.get(option, None)
    if max_size is not None:
        try:
            max_size = int(max_size)
        except (TypeError, ValueError):
            # The rejected value comes from the request and may contain
            # braces. self.log() appears to run str.format() on messages
            # unless format_msg=False (see the escaped "{{url}}" below), so
            # formatting is disabled to keep a hostile value from raising.
            self.log("Non-integer value received for rendering option "
                     "'{}': {}".format(option, max_size),
                     min_level=1, format_msg=False)
            self.log(traceback.format_exc(), min_level=1, format_msg=False)
            max_size = None
        else:
            allowed_max = render_options.max_response_size_limit
            if max_size < 0:
                self.log("The value of rendering option '{}' ({}) must be "
                         "0 or higher.".format(option, max_size),
                         min_level=1)
                max_size = None
            elif allowed_max is not None and max_size > allowed_max:
                self.log("The value of rendering option '{}' ({}) exceeds "
                         "the maximum value allowed.".format(
                             option, max_size),
                         min_level=1)
                max_size = None

    if max_size is None:
        # No usable per-request value: use the configured maximum, then the
        # global default.
        max_size = render_options.max_response_size_limit
        if max_size is None:
            max_size = defaults.RESPONSE_SIZE_LIMIT
    if max_size is None:
        return False

    for size, source in sizes_and_sources:
        if size is None or size <= max_size:
            continue
        self.log("The {} ({}) exceeds the maximum response size ({}), "
                 "aborting: {{url}}".format(source, size, max_size),
                 reply, min_level=1)
        # NOTE(review): this logs the render options object itself; it looks
        # like debugging output -- confirm it is intended.
        self.log(render_options, reply, min_level=1, format_msg=False)
        reply.abort()
        return True
    return False

def _on_reply_headers(self):
"""Signal emitted before reading response body, after getting headers
"""
reply = self.sender()
request = reply.request()

try:
content_length = _get_content_length(reply)
except _InvalidContentLength as error:
self.log("On response from {{url}}: {}".format(error),
reply, min_level=3)
content_length = None
sizes_and_sources = ((content_length, "Content-Length header"),)
if self._aborted_due_to_size(sizes_and_sources):
return

self._handle_reply_cookies(reply)
self._run_webpage_callbacks(request, "on_response_headers", reply)

Expand All @@ -413,6 +498,16 @@ def _on_reply_headers(self):
self.log("Headers received for {url}", reply, min_level=3)

def _on_reply_download_progress(self, received, total):
reply = self.sender()
request = reply.request()

sizes_and_sources = (
(total, "expected response size"),
(received, "size of the response content downloaded so far"),
)
if self._aborted_due_to_size(sizes_and_sources):
return

har = self._get_har()
if har is not None:
req_id = self._get_request_id()
Expand Down
7 changes: 4 additions & 3 deletions splash/render_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ class RenderOptions(object):

_REQUIRED = object()

def __init__(self, data, max_timeout):
def __init__(self, data, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
self.data = data
self.max_response_size_limit = max_response_size_limit
self.max_timeout = max_timeout

@classmethod
Expand All @@ -29,7 +30,7 @@ def raise_error(cls, argument, description, type='bad_argument', **kwargs):
raise BadOption(params)

@classmethod
def fromrequest(cls, request, max_timeout):
def fromrequest(cls, request, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
"""
Initialize options from a Twisted Request.
"""
Expand Down Expand Up @@ -60,7 +61,7 @@ def fromrequest(cls, request, max_timeout):
request.content.seek(0)

data['uid'] = id(request)
return cls(data, max_timeout)
return cls(data, max_timeout, max_response_size_limit=max_response_size_limit)

def get_expired_args(self, cache):
"""
Expand Down
37 changes: 26 additions & 11 deletions splash/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import splash
from splash.argument_cache import ArgumentCache
from splash import defaults
from splash.qtrender import (
HtmlRender, PngRender, JsonRender, HarRender, JpegRender
)
Expand Down Expand Up @@ -85,17 +86,18 @@ class BaseRenderResource(_ValidatingResource):
isLeaf = True
content_type = "text/html; charset=utf-8"

def __init__(self, pool, max_timeout, argument_cache):
def __init__(self, pool, max_timeout, argument_cache, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
Resource.__init__(self)
self.pool = pool
self.js_profiles_path = self.pool.js_profiles_path
self.max_timeout = max_timeout
self.argument_cache = argument_cache
self.max_response_size_limit = max_response_size_limit

def render_GET(self, request):
#log.msg("%s %s %s %s" % (id(request), request.method, request.path, request.args))
request.starttime = time.time()
render_options = RenderOptions.fromrequest(request, self.max_timeout)
render_options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit)

# process argument cache
original_options = render_options.data.copy()
Expand Down Expand Up @@ -281,8 +283,9 @@ def __init__(self, pool, sandboxed,
argument_cache,
strict,
implicit_main,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT,
):
BaseRenderResource.__init__(self, pool, max_timeout, argument_cache)
BaseRenderResource.__init__(self, pool, max_timeout, argument_cache, max_response_size_limit=max_response_size_limit)
self.sandboxed = sandboxed
self.lua_package_path = lua_package_path
self.lua_sandbox_allowed_modules = lua_sandbox_allowed_modules
Expand Down Expand Up @@ -434,20 +437,22 @@ class DemoUI(_ValidatingResource):

PATH = b'info'

def __init__(self, pool, lua_enabled, max_timeout):
def __init__(self, pool, lua_enabled, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
Resource.__init__(self)
self.pool = pool
self.lua_enabled = lua_enabled
self.max_timeout = max_timeout
self.max_response_size_limit = max_response_size_limit

def _validate_params(self, request):
options = RenderOptions.fromrequest(request, self.max_timeout)
options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit)
options.get_filters(self.pool) # check
params = options.get_common_params(self.pool.js_profiles_path)
params.update({
'save_args': options.get_save_args(),
'load_args': options.get_load_args(),
'timeout': options.get_timeout(),
'response_size_limit': options.get_response_size_limit(),
'request_body': options.get_request_body(),
'response_body': options.get_response_body(),
'har': 1,
Expand All @@ -471,6 +476,7 @@ def render_GET(self, request):
url = 'http://' + url
params['url'] = url
timeout = params['timeout']
response_size_limit = params['response_size_limit']
params = {k: v for k, v in params.items() if v is not None}

# disable "phases" HAR Viewer feature
Expand Down Expand Up @@ -514,6 +520,7 @@ def render_GET(self, request):
<input type="hidden" name="images" value="1">
<input type="hidden" name="expand" value="1"> <!-- for HAR viewer -->
<input type="hidden" name="timeout" value="%(timeout)s">
<input type="hidden" name="response_size_limit" value="%(response_size_limit)s">
<div class="btn-group" id="render-form">
<input class="form-control col-lg-8" type="text" placeholder="Paste an URL" type="text" name="url" value="%(url)s">
Expand Down Expand Up @@ -563,6 +570,7 @@ def render_GET(self, request):
"lua_enabled": self.lua_enabled,
}),
timeout=timeout,
response_size_limit=response_size_limit,
url=url,
theme=BOOTSTRAP_THEME,
cm_resources=CODEMIRROR_RESOURCES if self.lua_enabled else "",
Expand All @@ -576,18 +584,20 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled,
max_timeout,
argument_cache_max_entries,
strict_lua_runner,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT,
):
Resource.__init__(self)
self.argument_cache = ArgumentCache(argument_cache_max_entries)
self.ui_enabled = ui_enabled
self.lua_enabled = lua_enabled

_args = pool, max_timeout, self.argument_cache
self.putChild(b"render.html", RenderHtmlResource(*_args))
self.putChild(b"render.png", RenderPngResource(*_args))
self.putChild(b"render.jpeg", RenderJpegResource(*_args))
self.putChild(b"render.json", RenderJsonResource(*_args))
self.putChild(b"render.har", RenderHarResource(*_args))
_kwargs = {'max_response_size_limit': max_response_size_limit}
self.putChild(b"render.html", RenderHtmlResource(*_args, **_kwargs))
self.putChild(b"render.png", RenderPngResource(*_args, **_kwargs))
self.putChild(b"render.jpeg", RenderJpegResource(*_args, **_kwargs))
self.putChild(b"render.json", RenderJsonResource(*_args, **_kwargs))
self.putChild(b"render.har", RenderHarResource(*_args, **_kwargs))

self.putChild(b"_debug", DebugResource(pool, self.argument_cache))
self.putChild(b"_gc", ClearCachesResource(self.argument_cache))
Expand All @@ -605,6 +615,7 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled,
max_timeout=max_timeout,
argument_cache=self.argument_cache,
strict=strict_lua_runner,
max_response_size_limit=max_response_size_limit,
)
self.putChild(b"execute", ExecuteLuaScriptResource(
implicit_main=False, **lua_kwargs))
Expand All @@ -626,9 +637,11 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled,
self.putChild(DemoUI.PATH, DemoUI(
pool=pool,
lua_enabled=self.lua_enabled,
max_timeout=max_timeout
max_timeout=max_timeout,
max_response_size_limit=max_response_size_limit,
))
self.max_timeout = max_timeout
self.max_response_size_limit = max_response_size_limit

def getChild(self, name, request):
if name == b"" and self.ui_enabled:
Expand Down Expand Up @@ -720,6 +733,7 @@ def render_GET(self, request):
<input type="hidden" name="images" value="1">
<input type="hidden" name="expand" value="1"> <!-- for HAR viewer -->
<input type="hidden" name="timeout" value="%(timeout)s">
<input type="hidden" name="response_size_limit" value="%(response_size_limit)s">
<fieldset>
<div class="">
Expand Down Expand Up @@ -754,5 +768,6 @@ def render_GET(self, request):
}),
cm_resources=CODEMIRROR_RESOURCES,
timeout=self.max_timeout,
response_size_limit=self.max_response_size_limit,
)
return result.encode('utf8')
17 changes: 13 additions & 4 deletions splash/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ def parse_opts(jupyter=False, argv=sys.argv):
help="number of render slots (default: %default)")
op.add_option("--max-timeout", type="float", default=defaults.MAX_TIMEOUT,
help="maximum allowed value for timeout (default: %default)")
op.add_option("--max-response-size-limit", type="int",
default=defaults.MAX_RESPONSE_SIZE_LIMIT,
help="maximum allowed value for response size limit (default: %default)")
op.add_option("--disable-ui", action="store_true", default=False,
help="disable web UI")
op.add_option("--disable-lua", action="store_true", default=False,
Expand All @@ -94,6 +97,7 @@ def parse_opts(jupyter=False, argv=sys.argv):
opts.port = None
opts.slots = None
opts.max_timeout = None
opts.max_response_size_limit = None
opts.argument_cache_max_entries = None

return opts, args
Expand Down Expand Up @@ -170,7 +174,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout,
strict_lua_runner=False,
argument_cache_max_entries=None,
disable_browser_caches=False,
verbosity=None):
verbosity=None,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
from twisted.internet import reactor
from twisted.web.server import Site
from splash.resources import Root
Expand All @@ -181,8 +186,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout,
verbosity = defaults.VERBOSITY if verbosity is None else verbosity
slots = defaults.SLOTS if slots is None else slots

log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}".format(
verbosity, slots, argument_cache_max_entries, max_timeout
log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}, max-response-size-limit={}".format(
verbosity, slots, argument_cache_max_entries, max_timeout, max_response_size_limit
))

pool = RenderPool(
Expand Down Expand Up @@ -215,6 +220,7 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout,
max_timeout=max_timeout,
argument_cache_max_entries=argument_cache_max_entries,
strict_lua_runner=strict_lua_runner,
max_response_size_limit=max_response_size_limit,
)
factory = Site(root)
reactor.listenTCP(portnum, factory, interface=ip)
Expand Down Expand Up @@ -264,6 +270,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None,
verbosity=None,
server_factory=splash_server,
disable_browser_caches=False,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT,
):
from splash import network_manager
network_manager_factory = network_manager.NetworkManagerFactory(
Expand Down Expand Up @@ -293,6 +300,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None,
verbosity=verbosity,
max_timeout=max_timeout,
argument_cache_max_entries=argument_cache_max_entries,
max_response_size_limit=max_response_size_limit,
)


Expand Down Expand Up @@ -391,7 +399,8 @@ def main(jupyter=False, argv=sys.argv, server_factory=splash_server):
max_timeout=opts.max_timeout,
argument_cache_max_entries=opts.argument_cache_max_entries,
server_factory=server_factory,
disable_browser_caches=opts.disable_browser_caches
disable_browser_caches=opts.disable_browser_caches,
max_response_size_limit=opts.max_response_size_limit,
)
signal.signal(signal.SIGUSR1, lambda s, f: traceback.print_stack(f))

Expand Down
Loading

0 comments on commit a836917

Please sign in to comment.