Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Implement a maximum response size option #902

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions splash/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

MAX_TIMEOUT = 90.0

# Fallback response size limit: the network manager uses it when a request
# carries no usable ``response_size_limit`` option and no maximum limit is
# configured. ``None`` disables this fallback limit.
RESPONSE_SIZE_LIMIT = None
# Default for the ``--max-response-size-limit`` server option, i.e. the highest
# ``response_size_limit`` value a request may ask for. ``None`` means that no
# upper bound is applied.
MAX_RESPONSE_SIZE_LIMIT = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two look very alike, which is a bit confusing; please add a comment about what each of them stands for.

BTW why do we need two variables for this limit?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I recall right:

  • RESPONSE_SIZE_LIMIT is the limit to use as default if the Splash request does not specify one.

  • MAX_RESPONSE_SIZE_LIMIT is the maximum value that a Splash request may specify. If a higher value is specified by a Splash request, MAX_RESPONSE_SIZE_LIMIT is used as limit instead.

It is indeed confusing; I should add a comment.


# Default size of browser window. As there're no decorations, this affects
# both "window.inner*" and "window.outer*" values.
VIEWPORT_SIZE = '1024x768'
Expand Down
99 changes: 99 additions & 0 deletions splash/network_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,77 @@
)
from splash.response_middleware import ContentTypeMiddleware
from splash import defaults
from splash.qtutils import qt_header_items
from splash.utils import to_bytes
from splash.cookies import SplashCookieJar


class _InvalidContentLength(ValueError):

def __init__(self, value):
if isinstance(value, bytes):
value = '0x' + value.hex()
message = 'Invalid Content-Length header value: {}'.format(value)
super().__init__(message)


def _get_content_length(reply):
    """Return the size announced by the Content-Length header of ``reply``.

    Only the first Content-Length header yielded by ``qt_header_items`` is
    considered; if it holds a comma-separated list, only the first item is
    used. Surrounding whitespace is ignored.

    Returns the announced size as a non-negative ``int``, or ``None`` when
    the reply has no Content-Length header.

    Raises ``_InvalidContentLength`` if the value is anything other than a
    sequence of ASCII digits (this covers negative numbers, signs, empty
    values and non-ASCII bytes).
    """
    for name, value in qt_header_items(reply):
        if bytes(name).lower() != b'content-length':
            continue
        raw = bytes(value).split(b',', 1)[0].strip()
        # bytes.isdigit() accepts ASCII digits only, so forms that int()
        # would tolerate but are not valid sizes ("+5", "1_000", "-1") are
        # rejected here instead of being silently converted.
        if not raw.isdigit():
            try:
                shown = raw.decode('ascii')
            except UnicodeDecodeError:
                # Undecodable bytes are passed as is so that the exception
                # reports them in hexadecimal.
                shown = raw
            raise _InvalidContentLength(shown)
        return int(raw)
    return None


def _size_warrants_abort(sizes_and_sources, render_options, log, reply):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function looks over-complicated, can we improve it somehow?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I can extract some of the inner logic into separate functions, such as the parsing of max_size.

# Body of _size_warrants_abort: returns True when one of the (size, source)
# pairs in sizes_and_sources exceeds the effective size limit, False otherwise.
# Without render options no limit is enforced.
if render_options is None:
return False
# Limit requested for this specific request, if any.
option = "response_size_limit"
max_size = render_options.get(option, None)
if max_size is not None:
# NOTE(review): int() raises TypeError (not ValueError) for values such as
# lists or dicts; confirm render_options.get() cannot return those here.
try:
max_size = int(max_size)
except ValueError:
# Unparseable value: log it and behave as if no value had been given.
log("Non-integer value received for rendering option '{}': {}"
.format(option, max_size), min_level=1)
log(traceback.format_exc(), min_level=1, format_msg=False)
max_size = None
else:
# Negative values are rejected and ignored.
if max_size < 0:
log("The value of rendering option '{}' ({}) must be 0 or "
"higher.".format(option, max_size),min_level=1)
max_size = None
# Values above the configured maximum are discarded; the fallback below then
# substitutes the maximum itself.
elif (render_options.max_response_size_limit is not None and
max_size > render_options.max_response_size_limit):
log("The value of rendering option '{}' ({}) exceeds the "
"maximum value allowed.".format(option, max_size),
min_level=1)
max_size = None
# No usable per-request value: use the configured maximum if there is one,
# otherwise defaults.RESPONSE_SIZE_LIMIT.
if max_size is None:
if render_options.max_response_size_limit is not None:
max_size = render_options.max_response_size_limit
else:
max_size = defaults.RESPONSE_SIZE_LIMIT
# Still no limit: nothing can warrant an abort.
if max_size is None:
return False
# The first known size above the limit triggers the abort; unknown sizes
# (None) are skipped.
for size, source in sizes_and_sources:
if size is None:
continue
if size <= max_size:
continue
log("The {} ({}) exceeds the maximum response size ({}), aborting: "
"{{url}}".format(source, size, max_size), reply, min_level=1)
log(render_options, reply, min_level=1, format_msg=False)
return True
return False


class NetworkManagerFactory(object):
def __init__(self, filters_path=None, verbosity=None, allowed_schemes=None, disable_browser_caches=None):
verbosity = defaults.VERBOSITY if verbosity is None else verbosity
Expand Down Expand Up @@ -86,6 +153,7 @@ class ProxiedQNetworkAccessManager(QNetworkAccessManager):
* Tracks information about requests/responses and stores it in HAR format,
including request and response content.
* Allows to set per-request timeouts.
* Handles per-request response size limits.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't find how to set the "per-request response size limit" in this PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By adding response_size_limit to the render options of a request.

If you mean that the documentation should explain this, indeed.

"""
_REQUEST_ID = QNetworkRequest.User + 1
_SHOULD_TRACK = QNetworkRequest.User + 2
Expand Down Expand Up @@ -398,11 +466,32 @@ def _on_reply_finished(self):
content)
self.log("Finished downloading {url}", reply)

def _size_caused_abort(self, sizes_and_sources):
    """Abort the reply returned by ``self.sender()`` if it is too big.

    ``sizes_and_sources`` is forwarded unchanged to
    ``_size_warrants_abort`` together with the render options of the
    reply's request, ``self.log`` and the reply itself.

    Returns ``True`` if the reply was aborted, ``False`` otherwise.
    """
    reply = self.sender()
    render_options = self._get_render_options(reply.request())
    too_big = _size_warrants_abort(
        sizes_and_sources, render_options, self.log, reply)
    if not too_big:
        return False
    reply.abort()
    return True

def _on_reply_headers(self):
"""Signal emitted before reading response body, after getting headers
"""
reply = self.sender()
request = reply.request()

try:
content_length = _get_content_length(reply)
except _InvalidContentLength as error:
self.log("On response from {{url}}: {}".format(error),
reply, min_level=3)
content_length = None
sizes_and_sources = ((content_length, "Content-Length header"),)
if self._size_caused_abort(sizes_and_sources):
return

self._handle_reply_cookies(reply)
self._run_webpage_callbacks(request, "on_response_headers", reply)

Expand All @@ -413,6 +502,16 @@ def _on_reply_headers(self):
self.log("Headers received for {url}", reply, min_level=3)

def _on_reply_download_progress(self, received, total):
reply = self.sender()
request = reply.request()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

request is not used


sizes_and_sources = (
(total, "expected response size"),
(received, "size of the response content downloaded so far"),
)
if self._size_caused_abort(sizes_and_sources):
return

har = self._get_har()
if har is not None:
req_id = self._get_request_id()
Expand Down
7 changes: 4 additions & 3 deletions splash/render_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ class RenderOptions(object):

_REQUIRED = object()

def __init__(self, data, max_timeout):
def __init__(self, data, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
self.data = data
self.max_response_size_limit = max_response_size_limit
self.max_timeout = max_timeout

@classmethod
Expand All @@ -29,7 +30,7 @@ def raise_error(cls, argument, description, type='bad_argument', **kwargs):
raise BadOption(params)

@classmethod
def fromrequest(cls, request, max_timeout):
def fromrequest(cls, request, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
"""
Initialize options from a Twisted Request.
"""
Expand Down Expand Up @@ -60,7 +61,7 @@ def fromrequest(cls, request, max_timeout):
request.content.seek(0)

data['uid'] = id(request)
return cls(data, max_timeout)
return cls(data, max_timeout, max_response_size_limit=max_response_size_limit)

def get_expired_args(self, cache):
"""
Expand Down
37 changes: 26 additions & 11 deletions splash/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import splash
from splash.argument_cache import ArgumentCache
from splash import defaults
from splash.qtrender import (
HtmlRender, PngRender, JsonRender, HarRender, JpegRender
)
Expand Down Expand Up @@ -85,17 +86,18 @@ class BaseRenderResource(_ValidatingResource):
isLeaf = True
content_type = "text/html; charset=utf-8"

def __init__(self, pool, max_timeout, argument_cache):
def __init__(self, pool, max_timeout, argument_cache, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
Resource.__init__(self)
self.pool = pool
self.js_profiles_path = self.pool.js_profiles_path
self.max_timeout = max_timeout
self.argument_cache = argument_cache
self.max_response_size_limit = max_response_size_limit

def render_GET(self, request):
#log.msg("%s %s %s %s" % (id(request), request.method, request.path, request.args))
request.starttime = time.time()
render_options = RenderOptions.fromrequest(request, self.max_timeout)
render_options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit)

# process argument cache
original_options = render_options.data.copy()
Expand Down Expand Up @@ -281,8 +283,9 @@ def __init__(self, pool, sandboxed,
argument_cache,
strict,
implicit_main,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT,
):
BaseRenderResource.__init__(self, pool, max_timeout, argument_cache)
BaseRenderResource.__init__(self, pool, max_timeout, argument_cache, max_response_size_limit=max_response_size_limit)
self.sandboxed = sandboxed
self.lua_package_path = lua_package_path
self.lua_sandbox_allowed_modules = lua_sandbox_allowed_modules
Expand Down Expand Up @@ -434,20 +437,22 @@ class DemoUI(_ValidatingResource):

PATH = b'info'

def __init__(self, pool, lua_enabled, max_timeout):
def __init__(self, pool, lua_enabled, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
Resource.__init__(self)
self.pool = pool
self.lua_enabled = lua_enabled
self.max_timeout = max_timeout
self.max_response_size_limit = max_response_size_limit

def _validate_params(self, request):
options = RenderOptions.fromrequest(request, self.max_timeout)
options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit)
options.get_filters(self.pool) # check
params = options.get_common_params(self.pool.js_profiles_path)
params.update({
'save_args': options.get_save_args(),
'load_args': options.get_load_args(),
'timeout': options.get_timeout(),
'response_size_limit': options.get_response_size_limit(),
'request_body': options.get_request_body(),
'response_body': options.get_response_body(),
'har': 1,
Expand All @@ -471,6 +476,7 @@ def render_GET(self, request):
url = 'http://' + url
params['url'] = url
timeout = params['timeout']
response_size_limit = params['response_size_limit']
params = {k: v for k, v in params.items() if v is not None}

# disable "phases" HAR Viewer feature
Expand Down Expand Up @@ -514,6 +520,7 @@ def render_GET(self, request):
<input type="hidden" name="images" value="1">
<input type="hidden" name="expand" value="1"> <!-- for HAR viewer -->
<input type="hidden" name="timeout" value="%(timeout)s">
<input type="hidden" name="response_size_limit" value="%(response_size_limit)s">
<div class="btn-group" id="render-form">
<input class="form-control col-lg-8" type="text" placeholder="Paste an URL" type="text" name="url" value="%(url)s">
Expand Down Expand Up @@ -563,6 +570,7 @@ def render_GET(self, request):
"lua_enabled": self.lua_enabled,
}),
timeout=timeout,
response_size_limit=response_size_limit,
url=url,
theme=BOOTSTRAP_THEME,
cm_resources=CODEMIRROR_RESOURCES if self.lua_enabled else "",
Expand All @@ -576,18 +584,20 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled,
max_timeout,
argument_cache_max_entries,
strict_lua_runner,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT,
):
Resource.__init__(self)
self.argument_cache = ArgumentCache(argument_cache_max_entries)
self.ui_enabled = ui_enabled
self.lua_enabled = lua_enabled

_args = pool, max_timeout, self.argument_cache
self.putChild(b"render.html", RenderHtmlResource(*_args))
self.putChild(b"render.png", RenderPngResource(*_args))
self.putChild(b"render.jpeg", RenderJpegResource(*_args))
self.putChild(b"render.json", RenderJsonResource(*_args))
self.putChild(b"render.har", RenderHarResource(*_args))
_kwargs = {'max_response_size_limit': max_response_size_limit}
self.putChild(b"render.html", RenderHtmlResource(*_args, **_kwargs))
self.putChild(b"render.png", RenderPngResource(*_args, **_kwargs))
self.putChild(b"render.jpeg", RenderJpegResource(*_args, **_kwargs))
self.putChild(b"render.json", RenderJsonResource(*_args, **_kwargs))
self.putChild(b"render.har", RenderHarResource(*_args, **_kwargs))

self.putChild(b"_debug", DebugResource(pool, self.argument_cache))
self.putChild(b"_gc", ClearCachesResource(self.argument_cache))
Expand All @@ -605,6 +615,7 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled,
max_timeout=max_timeout,
argument_cache=self.argument_cache,
strict=strict_lua_runner,
max_response_size_limit=max_response_size_limit,
)
self.putChild(b"execute", ExecuteLuaScriptResource(
implicit_main=False, **lua_kwargs))
Expand All @@ -626,9 +637,11 @@ def __init__(self, pool, ui_enabled, lua_enabled, lua_sandbox_enabled,
self.putChild(DemoUI.PATH, DemoUI(
pool=pool,
lua_enabled=self.lua_enabled,
max_timeout=max_timeout
max_timeout=max_timeout,
max_response_size_limit=max_response_size_limit,
))
self.max_timeout = max_timeout
self.max_response_size_limit = max_response_size_limit

def getChild(self, name, request):
if name == b"" and self.ui_enabled:
Expand Down Expand Up @@ -720,6 +733,7 @@ def render_GET(self, request):
<input type="hidden" name="images" value="1">
<input type="hidden" name="expand" value="1"> <!-- for HAR viewer -->
<input type="hidden" name="timeout" value="%(timeout)s">
<input type="hidden" name="response_size_limit" value="%(response_size_limit)s">
<fieldset>
<div class="">
Expand Down Expand Up @@ -754,5 +768,6 @@ def render_GET(self, request):
}),
cm_resources=CODEMIRROR_RESOURCES,
timeout=self.max_timeout,
response_size_limit=self.max_response_size_limit,
)
return result.encode('utf8')
17 changes: 13 additions & 4 deletions splash/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ def parse_opts(jupyter=False, argv=sys.argv):
help="number of render slots (default: %default)")
op.add_option("--max-timeout", type="float", default=defaults.MAX_TIMEOUT,
help="maximum allowed value for timeout (default: %default)")
op.add_option("--max-response-size-limit", type="int",
default=defaults.MAX_RESPONSE_SIZE_LIMIT,
help="maximum allowed value for response size limit (default: %default)")
op.add_option("--disable-ui", action="store_true", default=False,
help="disable web UI")
op.add_option("--disable-lua", action="store_true", default=False,
Expand All @@ -94,6 +97,7 @@ def parse_opts(jupyter=False, argv=sys.argv):
opts.port = None
opts.slots = None
opts.max_timeout = None
opts.max_response_size_limit = None
opts.argument_cache_max_entries = None

return opts, args
Expand Down Expand Up @@ -170,7 +174,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout,
strict_lua_runner=False,
argument_cache_max_entries=None,
disable_browser_caches=False,
verbosity=None):
verbosity=None,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT):
from twisted.internet import reactor
from twisted.web.server import Site
from splash.resources import Root
Expand All @@ -181,8 +186,8 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout,
verbosity = defaults.VERBOSITY if verbosity is None else verbosity
slots = defaults.SLOTS if slots is None else slots

log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}".format(
verbosity, slots, argument_cache_max_entries, max_timeout
log.msg("verbosity={}, slots={}, argument_cache_max_entries={}, max-timeout={}, max-response-size-limit={}".format(
verbosity, slots, argument_cache_max_entries, max_timeout, max_response_size_limit
))

pool = RenderPool(
Expand Down Expand Up @@ -215,6 +220,7 @@ def splash_server(portnum, ip, slots, network_manager_factory, max_timeout,
max_timeout=max_timeout,
argument_cache_max_entries=argument_cache_max_entries,
strict_lua_runner=strict_lua_runner,
max_response_size_limit=max_response_size_limit,
)
factory = Site(root)
reactor.listenTCP(portnum, factory, interface=ip)
Expand Down Expand Up @@ -264,6 +270,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None,
verbosity=None,
server_factory=splash_server,
disable_browser_caches=False,
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT,
):
from splash import network_manager
network_manager_factory = network_manager.NetworkManagerFactory(
Expand Down Expand Up @@ -293,6 +300,7 @@ def default_splash_server(portnum, ip, max_timeout, slots=None,
verbosity=verbosity,
max_timeout=max_timeout,
argument_cache_max_entries=argument_cache_max_entries,
max_response_size_limit=max_response_size_limit,
)


Expand Down Expand Up @@ -391,7 +399,8 @@ def main(jupyter=False, argv=sys.argv, server_factory=splash_server):
max_timeout=opts.max_timeout,
argument_cache_max_entries=opts.argument_cache_max_entries,
server_factory=server_factory,
disable_browser_caches=opts.disable_browser_caches
disable_browser_caches=opts.disable_browser_caches,
max_response_size_limit=opts.max_response_size_limit,
)
signal.signal(signal.SIGUSR1, lambda s, f: traceback.print_stack(f))

Expand Down
Loading