Skip to content

Commit

Permalink
fuzzer bugfixes
Browse files Browse the repository at this point in the history
  • Loading branch information
eliaskosunen committed Oct 26, 2023
1 parent cb1d19a commit f5f394a
Show file tree
Hide file tree
Showing 17 changed files with 779 additions and 82 deletions.
2 changes: 1 addition & 1 deletion cmake/dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ endif()
FetchContent_Declare(
simdutf
GIT_REPOSITORY https://github.com/simdutf/simdutf.git
GIT_TAG v3.2.14
GIT_TAG v4.0.3
GIT_SHALLOW TRUE
)

Expand Down
8 changes: 5 additions & 3 deletions cmake/options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ option(SCN_USE_RTTI "Compile with RTTI (run-time type information) support" ON)
option(SCN_USE_NATIVE_ARCH "Add -march=native to build flags (gcc or clang only)" OFF)
option(SCN_USE_HASWELL_ARCH "Add -march=haswell to build flags (gcc or clang only)" OFF)

option(SCN_USE_ASAN "Compile with AddressSanitizer (clang only)" OFF)
option(SCN_USE_UBSAN "Compile with UndefinedBehaviorSanitizer (clang only)" OFF)
option(SCN_USE_MSAN "Compile with MemorySanitizer (clang only)" OFF)
option(SCN_USE_ASAN "Compile with AddressSanitizer" OFF)
option(SCN_USE_UBSAN "Compile with UndefinedBehaviorSanitizer" OFF)
option(SCN_USE_MSAN "Compile with MemorySanitizer" OFF)
option(SCN_USE_STACK_PROTECT "Compile with various stack protection measures (gcc or clang only)" OFF)
option(SCN_USE_SAFESTACK "Compile with SafeStack (clang only, requires STACK_PROTECT, disallows ASAN)" OFF)
17 changes: 17 additions & 0 deletions cmake/sanitizers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ endif()
target_compile_options(scn_sanitizer_fuzzer INTERFACE ${FUZZER_FLAG})
target_link_options(scn_sanitizer_fuzzer INTERFACE ${FUZZER_FLAG})

# Interface target that carries the stack-protection flags. The same flag list
# is applied both when compiling and when linking anything that links to it.
add_library(scn_sanitizer_stackprotect INTERFACE)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# GCC: debug info, stack protector on every function, plus -mshstk.
set(STACKPROTECT_FLAG -g -fstack-protector-all -mshstk)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# Clang: same base flags, plus the CFI sanitizer together with the -flto and
# -fvisibility=hidden flags that are passed alongside it.
set(STACKPROTECT_FLAG -g -fstack-protector-all -flto -fvisibility=hidden -fsanitize=cfi)
if (SCN_USE_SAFESTACK)
# SafeStack was requested: add it instead of -mshstk.
set(STACKPROTECT_FLAG ${STACKPROTECT_FLAG} -fsanitize=safe-stack)
else()
set(STACKPROTECT_FLAG ${STACKPROTECT_FLAG} -mshstk)
endif()
endif()
# NOTE(review): for any other compiler ID this block never sets
# STACKPROTECT_FLAG, so the two calls below presumably add nothing -- confirm
# that this silent no-op is intended.
target_compile_options(scn_sanitizer_stackprotect INTERFACE ${STACKPROTECT_FLAG})
target_link_options(scn_sanitizer_stackprotect INTERFACE ${STACKPROTECT_FLAG})

add_library(scn_sanitizers INTERFACE)

if (SCN_USE_ASAN)
Expand All @@ -45,6 +59,9 @@ endif()
if (SCN_USE_MSAN)
target_link_libraries(scn_sanitizers INTERFACE scn_sanitizer_msan)
endif()
if (SCN_USE_STACK_PROTECT)
target_link_libraries(scn_sanitizers INTERFACE scn_sanitizer_stackprotect)
endif()

if (SCN_FUZZING)
add_library(scn_fuzzer INTERFACE)
Expand Down
73 changes: 16 additions & 57 deletions src/scn/impl/algorithms/find_whitespace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,54 +28,14 @@ namespace scn {
// Checks a short run of single-byte code units for values above 127.
//
// `source` must expose data()/size(), have 1-byte elements, and hold at most
// 8 of them. The bytes are copied into a zero-initialised 64-bit integer, so
// unused trailing bytes stay 0, and the whole word is tested in one call.
//
// Returns true when has_byte_greater(word, 127) yields a non-zero result.
// NOTE(review): has_byte_greater is defined elsewhere; presumably it is
// non-zero exactly when some byte of the word exceeds the limit -- confirm.
template <typename R>
bool has_nonascii_char_64(R source)
{
    static_assert(sizeof(*source.data()) == 1);
    SCN_EXPECT(source.size() <= 8);

    uint64_t packed = 0;
    std::memcpy(&packed, source.data(), source.size());

    const auto high_byte_mask = has_byte_greater(packed, 127);
    return high_byte_mask != 0;
}

// Returns the position of the first element of `sv` for which `cb` is true,
// as reported by ranges::find_if.
//
// `sv` is taken by value, so the returned iterator is only meaningful for
// view-like ranges whose iterators refer to storage that outlives this call.
template <typename R, typename Cb>
auto find_impl_ascii(R sv, Cb cb)
{
    auto first_match = ranges::find_if(sv, cb);
    return first_match;
}

template <typename Cb>
auto find_impl_unicode_invalid(std::string_view sv, Cb cb)
{
auto it = sv.data();
while (it != sv.data() + sv.size()) {
auto tmp = std::string_view{
detail::to_address(it),
static_cast<size_t>(ranges::distance(
it, detail::to_address(sv.end())))};
auto res = get_next_code_point(tmp);
if (cb(res.value)) {
break;
}
it += ranges::distance(tmp.data(),
detail::to_address(res.iterator));
}
return sv.begin() + ranges::distance(sv.data(), it);
}

// Finds the first entry of `codepoints` accepted by `cb` and maps its index
// back to a position in the UTF-8 text `sv`.
//
// The byte offset is the UTF-8 length of all code points that precede the
// match, computed with simdutf::utf8_length_from_utf32. Returns sv.end() when
// no entry matches.
//
// NOTE(review): all 8 slots are tested, including any the caller left
// zero-filled because `sv` decoded to fewer than 8 code points -- confirm that
// `cb` can never accept U+0000 in that situation.
template <typename Cb>
auto find_impl_unicode_valid(
    std::string_view sv,
    const std::array<char32_t, 8>& codepoints,
    Cb cb)
{
    size_t match_index = 0;
    while (match_index < codepoints.size() && !cb(codepoints[match_index])) {
        ++match_index;
    }

    if (match_index == codepoints.size()) {
        return sv.end();
    }

    // UTF-8 size of everything before the matching code point.
    const auto prefix_bytes =
        simdutf::utf8_length_from_utf32(codepoints.data(), match_index);
    return sv.begin() + static_cast<std::ptrdiff_t>(prefix_bytes);
}

template <typename CuCb, typename CpCb>
std::string_view::iterator
find_classic_impl(std::string_view source, CuCb cu_cb, CpCb cp_cb)
Expand All @@ -84,37 +44,36 @@ namespace scn {
const auto end = source.data() + source.size();

while (it != end) {
SCN_EXPECT(it < end);
auto sv =
std::string_view{
it, static_cast<size_t>(ranges::distance(
it, detail::to_address(source.end())))}
.substr(0, 8);

if (!has_nonascii_char_64(sv)) {
auto i = find_impl_ascii(sv, cu_cb);
auto i = ranges::find_if(sv, cu_cb);
it = detail::to_address(i);
if (i != sv.end()) {
break;
}
continue;
}

std::array<char32_t, 8> codepoints{};
auto ret = simdutf::convert_utf8_to_utf32(
detail::to_address(it), sv.size(), codepoints.data());
if (SCN_UNLIKELY(ret == 0)) {
auto i = find_impl_unicode_invalid(sv, cp_cb);
it = detail::to_address(i);
if (i != sv.end()) {
break;
for (size_t i = 0; i < sv.size(); ++i) {
auto tmp = std::string_view{
detail::to_address(it),
static_cast<size_t>(ranges::distance(it, end))};
auto res = get_next_code_point(tmp);
if (cp_cb(res.value)) {
return source.begin() +
ranges::distance(source.data(), it);
}
continue;
}

auto i = find_impl_unicode_valid(sv, codepoints, cp_cb);
it = detail::to_address(i);
if (i != sv.end()) {
break;
auto n = ranges::distance(
tmp.data(), detail::to_address(res.iterator));
it += n;
i += n;
SCN_ENSURE(it <= end);
}
}

Expand Down
6 changes: 5 additions & 1 deletion src/scn/impl/reader/integer_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ namespace scn {

if (SCN_UNLIKELY(m_locale_options.thousands_sep != 0)) {
auto it = ranges::begin(range);
bool digit_matched = false;
for (; it != ranges::end(range); ++it) {
if (*it == m_locale_options.thousands_sep) {
m_thsep_indices.push_back(static_cast<char>(
Expand All @@ -318,8 +319,11 @@ namespace scn {
m_base) {
break;
}
else {
digit_matched = true;
}
}
if (it == ranges::begin(range)) {
if (SCN_UNLIKELY(!digit_matched)) {
return unexpected_scan_error(
scan_error::invalid_scanned_value,
"No matching characters");
Expand Down
113 changes: 93 additions & 20 deletions src/scn/impl/unicode/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,18 +116,37 @@ namespace scn {
{
SCN_EXPECT(!input.empty());

const auto len = code_point_length_by_starting_code_unit(input[0]);
if (SCN_UNLIKELY(len == 0)) {
return detail::invalid_code_point;
}

constexpr auto enc = get_encoding<CharT>();
char32_t output{};
size_t ret{};
if constexpr (enc == encoding::utf8) {
#ifndef NDEBUG
if (validate_unicode(input)) {
SCN_EXPECT(simdutf::utf32_length_from_utf8(
reinterpret_cast<const char*>(input.data()),
input.size()) == 1);
}
#endif
ret = simdutf::convert_utf8_to_utf32(
reinterpret_cast<const char*>(input.data()), input.size(),
&output);
reinterpret_cast<const char*>(input.data()), len, &output);
}
else if constexpr (enc == encoding::utf16) {
#ifndef NDEBUG
if (validate_unicode(input)) {
SCN_EXPECT(
simdutf::utf32_length_from_utf16(
reinterpret_cast<const char16_t*>(input.data()),
input.size()) == 1);
}
#endif
ret = simdutf::convert_utf16_to_utf32(
reinterpret_cast<const char16_t*>(input.data()),
input.size(), &output);
reinterpret_cast<const char16_t*>(input.data()), len,
&output);
}
else if constexpr (enc == encoding::utf32) {
output = static_cast<char32_t>(input[0]);
Expand Down Expand Up @@ -258,6 +277,9 @@ namespace scn {
return {get_start_of_next_code_point(input),
detail::invalid_code_point};
}
if (SCN_UNLIKELY(len > input.size())) {
return {input.end(), detail::invalid_code_point};
}

constexpr auto enc = get_encoding<CharT>();
std::size_t result{1};
Expand All @@ -284,6 +306,51 @@ namespace scn {
return {input.begin() + len, output};
}

// Finds the first position in `input` where a well-formed code point starts.
//
// The scan looks at one candidate leading code unit at a time:
//   * a code unit whose reported sequence length is 0 cannot start a code
//     point and is stepped over;
//   * if the sequence announced by the leading unit would run past the end of
//     `input`, the rest of the input is treated as unusable;
//   * otherwise the candidate sequence is validated with simdutf, using the
//     validator that matches get_encoding<CharT>().
//
// Returns an iterator to the first code unit of the first sequence that
// validates, or input.end() if there is none.
//
// NOTE(review): a sequence that fails validation is skipped by its full
// announced length, which may step over well-formed code units that follow a
// truncated sequence -- confirm that this is the intended resynchronisation.
template <typename CharT>
auto find_start_of_next_valid_code_point(
    std::basic_string_view<CharT> input)
    -> ranges::iterator_t<std::basic_string_view<CharT>>
{
    auto it = input.begin();
    while (it != input.end()) {
        const auto len = code_point_length_by_starting_code_unit(*it);
        if (len == 0) {
            // Not a leading code unit: try the next one.
            ++it;
            continue;
        }

        // Compare in the iterator's signed difference type so the bounds
        // check and the advance below do not mix signed and unsigned values.
        // `len` is a small sequence length, so the conversion is lossless.
        const auto remaining = ranges::distance(it, input.end());
        const auto step =
            static_cast<decltype(ranges::distance(it, input.end()))>(len);
        if (step > remaining) {
            return input.end();
        }

        // Every branch is `if constexpr`, so only the validator for the
        // actual encoding is instantiated and the lambda has exactly one
        // return statement.
        const auto validation = [&]() {
            constexpr auto enc = get_encoding<CharT>();
            if constexpr (enc == encoding::utf8) {
                return simdutf::validate_utf8_with_errors(
                    reinterpret_cast<const char*>(detail::to_address(it)),
                    len);
            }
            else if constexpr (enc == encoding::utf16) {
                return simdutf::validate_utf16_with_errors(
                    reinterpret_cast<const char16_t*>(
                        detail::to_address(it)),
                    len);
            }
            else if constexpr (enc == encoding::utf32) {
                return simdutf::validate_utf32_with_errors(
                    reinterpret_cast<const char32_t*>(
                        detail::to_address(it)),
                    len);
            }
        }();
        if (validation.error == simdutf::SUCCESS) {
            return it;
        }

        it += step;
    }
    return input.end();
}

template <typename CharT>
std::size_t count_valid_code_points(std::basic_string_view<CharT> input)
{
Expand Down Expand Up @@ -520,7 +587,7 @@ namespace scn {

auto it = source.begin();
std::array<DestCharT,
32 * std::max(sizeof(DestCharT) / sizeof(SourceCharT),
32 * std::max(sizeof(SourceCharT) / sizeof(DestCharT),
size_t{1})>
tmp{};
while (it != source.end()) {
Expand All @@ -530,28 +597,34 @@ namespace scn {

auto tmp_view = span<DestCharT>{tmp.data(), tmp.size()};
auto res = do_transcode(sv, tmp_view);
if (SCN_UNLIKELY(res.error != simdutf::SUCCESS)) {
if (SCN_LIKELY(res.error == simdutf::SUCCESS)) {
dest.append(tmp.data(), std::min(res.count, tmp.size()));
it = sv.end();
continue;
}

{
auto valid_sv = sv.substr(0, res.count);
auto n =
count_valid_transcoded_code_units<DestCharT>(valid_sv);
// Replacement character U+fffd
if constexpr (sizeof(DestCharT) == 1) {
tmp_view[n + 0] = static_cast<DestCharT>(0xef);
tmp_view[n + 1] = static_cast<DestCharT>(0xbf);
tmp_view[n + 2] = static_cast<DestCharT>(0xbd);
tmp_view = tmp_view.first(n + 3);
}
else {
tmp_view[n + 0] = static_cast<DestCharT>(0xfffd);
tmp_view = tmp_view.first(n + 1);
}
dest.append(tmp.data(), n);
it += res.count;
}

// Replacement character U+fffd
if constexpr (sizeof(DestCharT) == 1) {
tmp[0] = static_cast<DestCharT>(0xef);
tmp[1] = static_cast<DestCharT>(0xbf);
tmp[2] = static_cast<DestCharT>(0xbd);
dest.append(tmp.data(), 3);
}
else {
tmp_view = tmp_view.first(res.count);
dest.push_back(DestCharT{0xfffd});
}

dest.append(tmp_view.data(), tmp_view.size());
it = sv.end();
it = find_start_of_next_valid_code_point(
detail::make_string_view_from_iterators<SourceCharT>(
it, source.end()));
}
}

Expand Down
3 changes: 3 additions & 0 deletions tests/fuzz/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ add_fuzzer(string)
add_fuzzer(format)
add_fuzzer(roundtrip)

add_fuzzer(string_impl)
target_include_directories(scn_fuzz_string_impl PRIVATE "${PROJECT_SOURCE_DIR}/src")

add_custom_target(scn_fuzz_prepare ALL
COMMAND ${CMAKE_COMMAND} -E copy
"${CMAKE_CURRENT_LIST_DIR}/run-fuzz.sh"
Expand Down
2 changes: 2 additions & 0 deletions tests/fuzz/run-fuzz.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# First positional argument: name of the fuzzer to run.
fuzzer=$1
# Choose the value of `data` for that fuzzer. "roundtrip" maps to "int" and
# "string_impl" maps to "string"; every other fuzzer uses its own name.
# (How `data` is consumed is outside this excerpt.)
if [[ $fuzzer = "roundtrip" ]]; then
data="int"
elif [[ $fuzzer = "string_impl" ]]; then
data="string"
else
data=$fuzzer
fi
Expand Down
Loading

0 comments on commit f5f394a

Please sign in to comment.