From 9ba0197e72d3803ac2ba91f908db6db3f374b5a8 Mon Sep 17 00:00:00 2001
From: Kevin Wilfong <kevinwilfong@meta.com>
Date: Thu, 21 Nov 2024 16:47:50 -0800
Subject: [PATCH] fix: Match Presto's behavior for invalid UTF-8 in url_decode
 (#11604)

Summary:
Pull Request resolved: https://github.com/facebookincubator/velox/pull/11604

Presto Java converts the URL to a Java String after decoding it in url_decode. Java replaces bytes
in an invalid UTF-8 character with 0xEF 0xBF 0xBD.

Velox decodes invalid UTF-8 characters as is, which leads to differences in results from Java and
C++.

This diff adds a check when decoding URLs for invalid UTF-8 characters and does the same
replacement as Java.

Reviewed By: xiaoxmeng

Differential Revision: D66249265

fbshipit-source-id: d6993f8c70e72368b85b4c64814a45a1359f0802
---
 velox/functions/lib/Utf8Utils.cpp             |   6 -
 velox/functions/lib/Utf8Utils.h               |   4 +
 velox/functions/prestosql/URLFunctions.h      | 158 +++++++++++++-----
 .../prestosql/tests/URLFunctionsTest.cpp      |  32 ++++
 4 files changed, 154 insertions(+), 46 deletions(-)
diff --git a/velox/functions/lib/Utf8Utils.cpp b/velox/functions/lib/Utf8Utils.cpp
index 02354db6cbc1..2b027b6f2caf 100644
--- a/velox/functions/lib/Utf8Utils.cpp
+++ b/velox/functions/lib/Utf8Utils.cpp
@@ -18,10 +18,6 @@
 #include "velox/external/utf8proc/utf8procImpl.h"
 
 namespace facebook::velox::functions {
-namespace {
-
-// Returns the length of a UTF-8 character indicated by the first byte. Returns
-// -1 for invalid UTF-8 first byte.
 int firstByteCharLength(const char* u_input) {
   auto u = (const unsigned char*)u_input;
   unsigned char u0 = u[0];
@@ -59,8 +55,6 @@ int firstByteCharLength(const char* u_input) {
   return -1;
 }
 
-} // namespace
-
 int32_t
 tryGetUtf8CharLength(const char* input, int64_t size, int32_t& codePoint) {
   VELOX_DCHECK_NOT_NULL(input);
diff --git a/velox/functions/lib/Utf8Utils.h b/velox/functions/lib/Utf8Utils.h
index eb994219ec36..3c8950ee369a 100644
--- a/velox/functions/lib/Utf8Utils.h
+++ b/velox/functions/lib/Utf8Utils.h
@@ -82,4 +82,8 @@ FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length(
   return -1;
 }
 
+/// Returns the length of a UTF-8 character indicated by the first byte. Returns
+/// -1 for invalid UTF-8 first byte.
+int firstByteCharLength(const char* u_input);
+
 } // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h
index 1ce0c42d0e4a..372bfdef2142 100644
--- a/velox/functions/prestosql/URLFunctions.h
+++ b/velox/functions/prestosql/URLFunctions.h
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <boost/regex.hpp>
+#include "velox/external/utf8proc/utf8procImpl.h"
 #include "velox/functions/Macros.h"
 #include "velox/functions/lib/Utf8Utils.h"
 #include "velox/functions/prestosql/URIParser.h"
@@ -30,6 +31,13 @@ constexpr std::array<std::string_view, 6> kEncodedReplacementCharacterStrings =
      "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD",
      "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD",
      "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"};
+constexpr std::array<std::string_view, 6> kDecodedReplacementCharacterStrings{
+    "\xef\xbf\xbd",
+    "\xef\xbf\xbd\xef\xbf\xbd",
+    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
+    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
+    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
+    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};
 
 FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
   const auto& sub = match[idx];
@@ -46,6 +54,29 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
   output[2] = toHex(c % 16);
 }
 
+template <typename T>
+FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
+    const T& inputBuffer,
+    size_t inputIndex) {
+  return
+      // 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
+      // value less than 0x90 is considered an overlong encoding.
+      (inputBuffer[inputIndex] == '\xe0' &&
+       (inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
+      (inputBuffer[inputIndex] == '\xf0' &&
+       (inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
+      // 0xf4 followed by a byte >= 0x90 looks valid to
+      // tryGetUtf8CharLength, but is actually outside the range of valid
+      // code points.
+      (inputBuffer[inputIndex] == '\xf4' &&
+       (inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
+      // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
+      // multi-byte code points to tryGetUtf8CharLength, but are not part of
+      // any valid code point.
+      (unsigned char)inputBuffer[inputIndex] > 0xf4 ||
+      inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
+}
+
 /// Escapes ``input`` by encoding it so that it can be safely included in
 /// URL query parameter names and values:
 ///
@@ -99,31 +130,10 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
         // encodings or subsequences outside the range of valid 4 byte
         // sequences.  In both these cases we should just write out a
         // replacement character for every byte in the sequence.
-        size_t replaceCharactersToWriteOut = 1;
-        if (inputIndex < inputSize - 1) {
-          bool isMultipleInvalidSequences =
-              // 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
-              // value less than 0x90 is considered an overlong encoding.
-              (inputBuffer[inputIndex] == '\xe0' &&
-               (inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
-              (inputBuffer[inputIndex] == '\xf0' &&
-               (inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
-              // 0xf4 followed by a byte >= 0x90 looks valid to
-              // tryGetUtf8CharLength, but is actually outside the range of
-              // valid code points.
-              (inputBuffer[inputIndex] == '\xf4' &&
-               (inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
-              // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
-              // multi-byte code points to tryGetUtf8CharLength, but are not
-              // part of any valid code point.
-              (unsigned char)inputBuffer[inputIndex] > 0xf4 ||
-              inputBuffer[inputIndex] == '\xc0' ||
-              inputBuffer[inputIndex] == '\xc1';
-
-          if (isMultipleInvalidSequences) {
-            replaceCharactersToWriteOut = charLength * -1;
-          }
-        }
+        size_t replaceCharactersToWriteOut = inputIndex < inputSize - 1 &&
+                isMultipleInvalidSequences(inputBuffer, inputIndex)
+            ? -charLength
+            : 1;
 
         const auto& replacementCharacterString =
             kEncodedReplacementCharacterStrings
@@ -141,6 +151,28 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
   output.resize(outIndex);
 }
 
+FOLLY_ALWAYS_INLINE char decodeByte(const char* p, const char* end) {
+  char buf[3];
+  buf[2] = '\0';
+
+  if (p + 2 < end) {
+    buf[0] = p[1];
+    buf[1] = p[2];
+    p += 2;
+
+    char* endptr;
+    char val = strtol(buf, &endptr, 16);
+
+    if (endptr != buf + 2) {
+      VELOX_USER_FAIL("Illegal hex characters in escape (%) pattern: {}", buf);
+    }
+
+    return val;
+  } else {
+    VELOX_USER_FAIL("Incomplete trailing escape (%) pattern");
+  }
+}
+
 template <typename TOutString, typename TInString, bool unescapePlus = false>
 FOLLY_ALWAYS_INLINE void urlUnescape(
     TOutString& output,
@@ -151,9 +183,7 @@ FOLLY_ALWAYS_INLINE void urlUnescape(
   auto outputBuffer = output.data();
   const char* p = input.data();
   const char* end = p + inputSize;
-  char buf[3];
-  buf[2] = '\0';
-  char* endptr;
+
   for (; p < end; ++p) {
     if constexpr (unescapePlus) {
       if (*p == '+') {
@@ -162,20 +192,68 @@ FOLLY_ALWAYS_INLINE void urlUnescape(
       }
     }
     if (*p == '%') {
-      if (p + 2 < end) {
-        buf[0] = p[1];
-        buf[1] = p[2];
-        int val = strtol(buf, &endptr, 16);
-        if (endptr == buf + 2) {
-          *outputBuffer++ = (char)val;
-          p += 2;
-        } else {
-          VELOX_USER_FAIL(
-              "Illegal hex characters in escape (%) pattern: {}", buf);
-        }
+      char firstByte = decodeByte(p, end);
+      int32_t charLength = firstByteCharLength(&firstByte);
+
+      if (charLength == 1) {
+        // This is an ASCII character, just write it out.
+        *outputBuffer++ = firstByte;
+      } else if (charLength < 0) {
+        // This isn't the start of a valid UTF-8 character, write out the
+        // replacement character.
+        const auto& replacementString = kDecodedReplacementCharacterStrings[0];
+        std::memcpy(
+            outputBuffer, replacementString.data(), replacementString.length());
+        outputBuffer += replacementString.length();
       } else {
-        VELOX_USER_FAIL("Incomplete trailing escape (%) pattern");
+        char* charStart = outputBuffer;
+        *outputBuffer++ = firstByte;
+        int32_t charLengthRemaining = charLength - 1;
+
+        // Iterate over each percent encoded byte of the UTF-8 character.
+        while (charLengthRemaining > 0 && p + 3 < end && *(p + 3) == '%') {
+          char val = decodeByte(p + 3, end);
+
+          if (!utf_cont(val)) {
+            // If the byte is not a continuation character this is not valid
+            // UTF-8 abort so we can write out replacement character(s).
+            break;
+          }
+
+          // Skip over the previous percent encoded value in the input. We only
+          // do this after checking if the current byte is valid because if the
+          // current byte is invalid, it might be a valid byte in the next code
+          // point.
+          p += 3;
+          *outputBuffer++ = val;
+          charLengthRemaining--;
+        }
+
+        int32_t codePoint;
+        if (charLengthRemaining > 0 ||
+            tryGetUtf8CharLength(charStart, charLength, codePoint) < 0) {
+          // If we exited the loop early it means we encountered a byte that
+          // wasn't part of a valid UTF-8 code point. If tryGetUtf8CharLength
+          // returns a negative value it means even though the bytes looked like
+          // valid UTF-8 they were not, e.g. they were an overlong code point.
+          size_t charLength = outputBuffer - charStart;
+          size_t replaceCharactersToWriteOut =
+              isMultipleInvalidSequences(charStart, 0) ? charLength : 1;
+          const auto& replacementString = kDecodedReplacementCharacterStrings
+              [replaceCharactersToWriteOut - 1];
+
+          outputBuffer = charStart;
+          std::memcpy(
+              outputBuffer,
+              replacementString.data(),
+              replacementString.length());
+          outputBuffer += replacementString.length();
+        }
       }
+
+      // Skip over the last percent encoded value in the code point (the for
+      // loop will handle skipping over the third char).
+      p += 2;
     } else {
       *outputBuffer++ = *p;
     }
diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp
index 3fb18c22e7b1..6802105c137c 100644
--- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp
+++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp
@@ -612,6 +612,38 @@ TEST_F(URLFunctionsTest, urlDecode) {
       urlDecode("http%3A%2F%2F%E3%83%86%E3%82%B9%E3%83%88"));
   EXPECT_EQ("~@:.-*_+ \u2603", urlDecode("%7E%40%3A.-*_%2B+%E2%98%83"));
   EXPECT_EQ("test", urlDecode("test"));
+  // Test a single byte invalid UTF-8 character.
+  EXPECT_EQ("te\xef\xbf\xbdst", urlDecode("te%88st"));
+  // Test a multi-byte invalid UTF-8 character. (If the first byte is between
+  // 0xe0 and 0xef, it should be a 3 byte character, but we only have 2 bytes
+  // here.)
+  EXPECT_EQ("te\xef\xbf\xbdst", urlDecode("te%e0%b8st"));
+  // Test an overlong 3 byte UTF-8 character
+  EXPECT_EQ("\xef\xbf\xbd\xef\xbf\xbd", urlDecode("%e0%94"));
+  // Test an overlong 3 byte UTF-8 character with a continuation byte.
+  EXPECT_EQ("\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd", urlDecode("%e0%94%83"));
+  // Test an overlong 4 byte UTF-8 character
+  EXPECT_EQ("\xef\xbf\xbd\xef\xbf\xbd", urlDecode("%f0%84"));
+  // Test an overlong 4 byte UTF-8 character with continuation bytes.
+  EXPECT_EQ(
+      "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
+      urlDecode("%f0%84%90%90"));
+  // Test a 4 byte UTF-8 character outside the range of valid values.
+  EXPECT_EQ(
+      "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
+      urlDecode("%fa%80%80%80"));
+  // Test the beginning of a 4 byte UTF-8 character followed by a
+  // non-continuation byte.
+  EXPECT_EQ("\xef\xbf\xbd\xef\xbf\xbd", urlDecode("%f0%e0"));
+  // Test the invalid byte 0xc0.
+  EXPECT_EQ("\xef\xbf\xbd\xef\xbf\xbd", urlDecode("%c0%83"));
+  // Test the invalid byte 0xc1.
+  EXPECT_EQ("\xef\xbf\xbd\xef\xbf\xbd", urlDecode("%c1%83"));
+  // Test a 4 byte UTF-8 character that looks valid, but is actually outside the
+  // range of valid values.
+  EXPECT_EQ(
+      "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
+      urlDecode("%f4%92%83%83"));
 
   EXPECT_THROW(urlDecode("http%3A%2F%2"), VeloxUserError);
   EXPECT_THROW(urlDecode("http%3A%2F%"), VeloxUserError);