scrapy · okomestudio · Jul 27, 2018 · Feb 5, 2020 · Feb 6, 2020 · Feb 11, 2020
diff --git a/tests/test_http.py b/tests/test_http.py
@@ -28,12 +28,42 @@ def test_headers_raw_dict_none(self):
         self.assertIsNone(headers_dict_to_raw(None))
 
     def test_headers_raw_to_dict(self):
-        raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
-                Cache-Control: no-cache\n\rCache-Control: no-store\n\n"
-        dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], 
+        raw = b'\r\n'.join((b"Content-type: text/html",
+                            b"Accept: gzip",
+                            b"Cache-Control: no-cache",
+                            b"Cache-Control: no-store"))
+        dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'],
                b'Cache-Control': [b'no-cache', b'no-store']}
         self.assertEqual(headers_raw_to_dict(raw), dct)
 
+    def test_headers_raw_to_dict_multiline(self):
+        raw = b'\r\n'.join((b'Content-Type: multipart/related;',
+                            b'  type="application/xop+xml";',
+                            b'\tboundary="example"',
+                            b'Cache-Control: no-cache'))
+        # With strict=False, the header value that spans across
+        # multiple lines does not get parsed fully, and only the first
+        # line is retained.
+        dct = {b'Content-Type': [b'multipart/related;'],
+               b'Cache-Control': [b'no-cache']}
+        self.assertEqual(headers_raw_to_dict(raw), dct)
+
+    def test_headers_raw_to_dict_multiline_strict(self):
+        raw = b'\r\n'.join((b'Content-Type: multipart/related;',
+                            b'  type="application/xop+xml";',
+                            b'\tboundary="example"',
+                            b'Cache-Control: no-cache'))
+        # With strict=True, the header value that spans across
+        # multiple lines does get parsed fully.
+        dct = {
+            b'Content-Type': [
+                b'\r\n'.join((b'multipart/related;',
+                              b'  type="application/xop+xml";',
+                              b'\tboundary="example"'))
+            ],
+            b'Cache-Control': [b'no-cache']}
+        self.assertEqual(headers_raw_to_dict(raw, strict=True), dct)
+
     def test_headers_dict_to_raw(self):
         dct = OrderedDict([
             (b'Content-type', b'text/html'),

diff --git a/w3lib/http.py b/w3lib/http.py
@@ -1,11 +1,17 @@
 from base64 import urlsafe_b64encode
 
 
-def headers_raw_to_dict(headers_raw):
+def headers_raw_to_dict(headers_raw, strict=False):
     r"""
     Convert raw headers (single multi-line bytestring)
     to a dictionary.
 
+    `strict` is a bool parameter controlling the multi-line parsing behavior.
+    If 'True', only the character sequence '\r\n' is considered as the line
+    delimiter, as per the HTTP specification (e.g., RFC 2616). If 'False'
+    (default), lines are delimited by 'str.splitlines()', so a wider range
+    of characters (e.g., a bare '\n' or '\r') is treated as line boundaries.
+
     For example:
 
     >>> import w3lib.http
@@ -27,7 +33,20 @@ def headers_raw_to_dict(headers_raw):
 
     if headers_raw is None:
         return None
-    headers = headers_raw.splitlines()
+
+    if strict:
+        headers = []
+        for line in headers_raw.split(b'\r\n'):
+            if line.startswith(b' ') or line.startswith(b'\t'):
+                try:
+                    headers[-1] += (b'\r\n' + line)
+                except IndexError:
+                    raise ValueError('Malformed raw headers')
+            else:
+                headers.append(line)
+    else:
+        headers = headers_raw.splitlines()
+
     headers_tuples = [header.split(b':', 1) for header in headers]
 
     result_dict = {}