From 11330d28e7d634fa3590575c50362b6e00044fa2 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Tue, 25 May 2021 21:55:35 -0400 Subject: [PATCH 01/15] Use CMake policy 0091 to ensure MSVC uses a static runtime for building tests --- CMakeLists.txt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 878b871..07a6082 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,16 @@ # @@COPYRIGHT@@ #*------------------------------------------------------------------------- +# Build options: + +# Build tests if requested: +option(CML_BUILD_TESTING "Build CML tests" OFF) + +# CMAKE_MSVC_RUNTIME_LIBRARY +if(MSVC AND CML_BUILD_TESTING) + cmake_policy(SET CMP0091 NEW) +endif() + # Set the minimum CMake version: cmake_minimum_required(VERSION 3.15) @@ -35,8 +45,6 @@ message(STATUS "Building CML ${CML_VERSION}") # Create the CML interface library: include(CML.cmake) -# Build tests if requested: -option(CML_BUILD_TESTING "Build CML tests" OFF) if(CML_BUILD_TESTING) enable_testing() add_subdirectory(tests) From 29a0433cbb39cd848953ccd0a238fde8708c3be9 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Tue, 25 May 2021 22:13:14 -0400 Subject: [PATCH 02/15] Timed naive mxm for 4x4 C arrays --- tests/CMakeLists.txt | 15 +++++- tests/timing/CMakeLists.txt | 18 +++++++ tests/timing/cmatrix.h | 10 ++++ tests/timing/cmatrix_mxm.cpp | 19 +++++++ tests/timing/cmatrix_mxm.h | 12 +++++ tests/timing/cmatrix_mxm_timing.cpp | 77 +++++++++++++++++++++++++++++ tests/timing/timing.cpp | 51 +++++++++++++++++++ tests/timing/timing.h | 14 ++++++ 8 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 tests/timing/CMakeLists.txt create mode 100644 tests/timing/cmatrix.h create mode 100644 tests/timing/cmatrix_mxm.cpp create mode 100644 tests/timing/cmatrix_mxm.h create mode 100644 tests/timing/cmatrix_mxm_timing.cpp create mode 100644 tests/timing/timing.cpp create mode 100644 tests/timing/timing.h diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a8f8f53..e893331 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,6 +2,15 @@ # @@COPYRIGHT@@ #*------------------------------------------------------------------------- +if(MSVC AND NOT BUILD_SHARED_LIBS) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded$<$:Debug>) +endif() + +# Put test binraries into a convenient directory, if not already set: +if(NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +endif() + # Need the test macros: include(CMLTestMacros) @@ -27,4 +36,8 @@ add_subdirectory(quaternion) add_subdirectory(mathlib) # util tests: -add_subdirectory(util) \ No newline at end of file +add_subdirectory(util) + +#if(CML_BUILD_TIMING) + add_subdirectory(timing) +#endif() \ No newline at end of file diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt new file mode 100644 index 0000000..9061865 --- /dev/null +++ b/tests/timing/CMakeLists.txt @@ -0,0 +1,18 @@ +# -*- cmake -*- ----------------------------------------------------------- +# @@COPYRIGHT@@ +#*------------------------------------------------------------------------- + +project(CML_Testing_Timing) + +macro(cml_add_timing _Name) + set(ExecName "${_Name}") + add_executable(${ExecName} ${ARGN}) + set_target_properties(${ExecName} PROPERTIES FOLDER "CML-Timing") + target_link_libraries(${ExecName} cml cml_timing) +endmacro() + +add_library(cml_timing STATIC timing.h timing.cpp) +target_include_directories(cml_timing PUBLIC ${CMAKE_CURRENT_LIST_DIR}) + +cml_add_timing(cmatrix_mxm_timing + cmatrix.h cmatrix_mxm.h cmatrix_mxm.cpp cmatrix_mxm_timing.cpp) \ No newline at end of file diff --git a/tests/timing/cmatrix.h b/tests/timing/cmatrix.h new file mode 100644 index 0000000..aa27d72 --- /dev/null +++ b/tests/timing/cmatrix.h @@ -0,0 +1,10 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +using matrix44d = double[4][4]; \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm.cpp b/tests/timing/cmatrix_mxm.cpp new file mode 100644 index 0000000..87ff439 --- /dev/null +++ b/tests/timing/cmatrix_mxm.cpp @@ -0,0 +1,19 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#include "cmatrix_mxm.h" + +void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) +{ + for(int row = 0; row < 4; ++row) { + for(int col = 0; col < 4; ++col) { + double sum = A[row][0]*B[0][col]; + for(int k = 1; k < 4; ++k) sum += A[row][k]*B[k][col]; + res[row][col] = sum; + } + } +} \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm.h b/tests/timing/cmatrix_mxm.h new file mode 100644 index 0000000..392ec5a --- /dev/null +++ b/tests/timing/cmatrix_mxm.h @@ -0,0 +1,12 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +#include "cmatrix.h" + +void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B); \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm_timing.cpp b/tests/timing/cmatrix_mxm_timing.cpp new file mode 100644 index 0000000..d495844 --- /dev/null +++ b/tests/timing/cmatrix_mxm_timing.cpp @@ -0,0 +1,77 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#include +#include +#include +#include +#include +#include +#include +#include "cmatrix_mxm.h" +#include "timing.h" + +using matrix_pair_t = std::pair; + +/* NOTE: d must return values in [0.,1.) */ +template +void uniform_random_rotation_4( + Gen& g, std::uniform_real_distribution<>& d, Matrix44T& M) +{ + cml_require(0. <= d.a() && d.a() <= 1., + std::invalid_argument, "d not in [0,1.]"); + + /* First, create a random unit quaternion (see K. Shoemake, Graphics Gems + * III): + */ + const auto two_pi = cml::constants::two_pi(); + const auto u1 = d(g), u2 = d(g), u3 = d(g); + const auto s1 = std::sqrt(1. - u1), s2 = std::sqrt(u1); + const auto t1 = two_pi * d(g), t2 = two_pi * d(g); + const auto w = std::cos(t2) * s2; + const auto x = std::sin(t1) * s1; + const auto y = std::cos(t1) * s1; + const auto z = std::sin(t2) * s2; + const auto q = cml::quaterniond(w, x, y, z).normalize(); + + cml::external44d M_out(&M[0][0]); + cml::matrix_rotation_quaternion(M_out, q); +} + + +int main(int /*argc*/, char** /*argv*/) +{ + const int N = 1; + //const int N = 1000000; + + /* Pre-generate N repeatable pairs of random 4x4 rotations: */ + const auto prep_time_start = cml::testing::usec_time(); + std::mt19937 rng(0xdeadbeef); + std::uniform_real_distribution<> d(0., std::nextafter(0., 1.)); + std::vector rotations(N); + for(int i = 0; i < N; ++ i) { + uniform_random_rotation_4(rng, d, rotations[i].first); + uniform_random_rotation_4(rng, d, rotations[i].second); + } + const auto prep_time_end = cml::testing::usec_time(); + const auto prep_time = prep_time_end - prep_time_start; + std::printf("prep time (%d pairs): %.5lf s\n", N, static_cast(prep_time)/1e6); + + /* Time N multiplications: */ + matrix44d out; + const auto mxm_time_start = cml::testing::usec_time(); + for(int i = 0; i < N; ++ i) { + + mxm_4x4(out, rotations[i].first, rotations[i].second); + + } + const auto mxm_time_end = cml::testing::usec_time(); + const auto mxm_time = mxm_time_end - mxm_time_start; + std::printf("mxm time (%d pairs): %.5lf s\n", N, static_cast(mxm_time)/1e6); + + return 0; +} \ No newline at end of file diff --git a/tests/timing/timing.cpp b/tests/timing/timing.cpp new file mode 100644 index 0000000..7dfcad0 --- /dev/null +++ b/tests/timing/timing.cpp @@ -0,0 +1,51 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#include "timing.h" + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOCRYPT +#define NOGDI +#define NOSERVICE +#define NOMCX +#define NOIME +#include +#include +#include +#else +#include +#endif + +namespace cml { +namespace testing { +#if defined(_WIN32) +usec_t usec_time() +{ + LARGE_INTEGER liC, liF; + + /* Get counts per second: */ + QueryPerformanceFrequency(&liF); + + /* Get current count: */ + QueryPerformanceCounter(&liC); + + LONGLONG freq = liF.QuadPart; + LONGLONG val = liC.QuadPart; + + /* Return total number of microseconds: */ + return (val*1000000ULL)/freq; +} +#else +usec_t usec_time() +{ + timeval tv; + gettimeofday(&tv,0); + return tv.tv_sec*1000000ULL + tv.tv_usec; +} +#endif +}} // namespace cml::testing \ No newline at end of file diff --git a/tests/timing/timing.h b/tests/timing/timing.h new file mode 100644 index 0000000..639fb45 --- /dev/null +++ b/tests/timing/timing.h @@ -0,0 +1,14 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +namespace cml { +namespace testing { + using usec_t = unsigned long long; + usec_t usec_time(); +}} // cml::testing \ No newline at end of file From 710ec334db425a41d64df12a019df97f335194e8 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Tue, 25 May 2021 22:33:22 -0400 Subject: [PATCH 03/15] Split out uniform_random_rotation_4 --- tests/timing/CMakeLists.txt | 16 +++++++-- tests/timing/cmatrix_mxm_timing.cpp | 35 ++---------------- tests/timing/uniform_random_rotation.h | 23 ++++++++++++ tests/timing/uniform_random_rotation.tpp | 45 ++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 34 deletions(-) create mode 100644 tests/timing/uniform_random_rotation.h create mode 100644 tests/timing/uniform_random_rotation.tpp diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 9061865..23a1593 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -11,8 +11,20 @@ macro(cml_add_timing _Name) target_link_libraries(${ExecName} cml cml_timing) endmacro() -add_library(cml_timing STATIC timing.h timing.cpp) -target_include_directories(cml_timing PUBLIC ${CMAKE_CURRENT_LIST_DIR}) +set(cml_timing_INLINES uniform_random_rotation.tpp) +add_library(cml_timing STATIC + timing.h timing.cpp + uniform_random_rotation.h + ${cml_timing_INLINES} + ) +target_include_directories(cml_timing + PUBLIC ${CMAKE_CURRENT_LIST_DIR}) +set_source_files_properties(${cml_timing_INLINES} PROPERTIES + HEADER_FILE_ONLY 1) +if(${CMAKE_GENERATOR} MATCHES "Visual Studio") + set(_inlines_group "Inlines\\") + source_group("${_inlines_group}" FILES ${cml_timing_INLINES}) +endif() cml_add_timing(cmatrix_mxm_timing cmatrix.h cmatrix_mxm.h cmatrix_mxm.cpp cmatrix_mxm_timing.cpp) \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm_timing.cpp b/tests/timing/cmatrix_mxm_timing.cpp index d495844..484c862 100644 --- a/tests/timing/cmatrix_mxm_timing.cpp +++ b/tests/timing/cmatrix_mxm_timing.cpp @@ -6,43 +6,14 @@ */ #include -#include #include #include -#include -#include -#include #include "cmatrix_mxm.h" #include "timing.h" +#include "uniform_random_rotation.h" using matrix_pair_t = std::pair; -/* NOTE: d must return values in [0.,1.) */ -template -void uniform_random_rotation_4( - Gen& g, std::uniform_real_distribution<>& d, Matrix44T& M) -{ - cml_require(0. <= d.a() && d.a() <= 1., - std::invalid_argument, "d not in [0,1.]"); - - /* First, create a random unit quaternion (see K. Shoemake, Graphics Gems - * III): - */ - const auto two_pi = cml::constants::two_pi(); - const auto u1 = d(g), u2 = d(g), u3 = d(g); - const auto s1 = std::sqrt(1. - u1), s2 = std::sqrt(u1); - const auto t1 = two_pi * d(g), t2 = two_pi * d(g); - const auto w = std::cos(t2) * s2; - const auto x = std::sin(t1) * s1; - const auto y = std::cos(t1) * s1; - const auto z = std::sin(t2) * s2; - const auto q = cml::quaterniond(w, x, y, z).normalize(); - - cml::external44d M_out(&M[0][0]); - cml::matrix_rotation_quaternion(M_out, q); -} - - int main(int /*argc*/, char** /*argv*/) { const int N = 1; @@ -54,8 +25,8 @@ int main(int /*argc*/, char** /*argv*/) std::uniform_real_distribution<> d(0., std::nextafter(0., 1.)); std::vector rotations(N); for(int i = 0; i < N; ++ i) { - uniform_random_rotation_4(rng, d, rotations[i].first); - uniform_random_rotation_4(rng, d, rotations[i].second); + cml::testing::uniform_random_rotation_4(rng, d, rotations[i].first); + cml::testing::uniform_random_rotation_4(rng, d, rotations[i].second); } const auto prep_time_end = cml::testing::usec_time(); const auto prep_time = prep_time_end - prep_time_start; diff --git a/tests/timing/uniform_random_rotation.h b/tests/timing/uniform_random_rotation.h new file mode 100644 index 0000000..87af7a1 --- /dev/null +++ b/tests/timing/uniform_random_rotation.h @@ -0,0 +1,23 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +#include + +namespace cml { +namespace testing { + +template +void uniform_random_rotation_4( + Gen& g, std::uniform_real_distribution<>& d, Matrix44T& M); + +}} // cml::testing + +#define _CML_TESTING_UNIFORM_RANDOM_ROTATION_TPP +#include "uniform_random_rotation.tpp" +#undef _CML_TESTING_UNIFORM_RANDOM_ROTATION_TPP diff --git a/tests/timing/uniform_random_rotation.tpp b/tests/timing/uniform_random_rotation.tpp new file mode 100644 index 0000000..c331010 --- /dev/null +++ b/tests/timing/uniform_random_rotation.tpp @@ -0,0 +1,45 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#ifndef _CML_TESTING_UNIFORM_RANDOM_ROTATION_TPP +#error "uniform_random_rotation.tpp not included correctly" +#endif + +#include +#include +#include + +namespace cml { +namespace testing { + +/* NOTE: d must return values in [0.,1.) */ +// TODO make this an official CML function? +template +void uniform_random_rotation_4( + Gen& g, std::uniform_real_distribution<>& d, Matrix44T& M) +{ + cml_require(0. <= d.a() && d.a() <= 1., + std::invalid_argument, "d not in [0,1.]"); + + /* First, create a random unit quaternion (see K. Shoemake, Graphics Gems + * III): + */ + const auto two_pi = cml::constants::two_pi(); + const auto u1 = d(g), u2 = d(g), u3 = d(g); + const auto s1 = std::sqrt(1. - u1), s2 = std::sqrt(u1); + const auto t1 = two_pi * d(g), t2 = two_pi * d(g); + const auto w = std::cos(t2) * s2; + const auto x = std::sin(t1) * s1; + const auto y = std::cos(t1) * s1; + const auto z = std::sin(t2) * s2; + const auto q = cml::quaterniond(w, x, y, z).normalize(); + + cml::external44d M_out(&M[0][0]); + cml::matrix_rotation_quaternion(M_out, q); +} + +}} // cml::testing From b287126232c3e27047635c447cacea73db25f0cd Mon Sep 17 00:00:00 2001 From: demianmnave Date: Tue, 25 May 2021 22:59:34 -0400 Subject: [PATCH 04/15] Split out rotation matrix pair generator --- tests/timing/CMakeLists.txt | 28 +++++++++++++++---- tests/timing/cmatrix_mxm_timing.cpp | 21 ++++---------- tests/timing/make_rotation_matrix_pairs.h | 24 ++++++++++++++++ tests/timing/make_rotation_matrix_pairs.tpp | 31 +++++++++++++++++++++ tests/timing/uniform_random_rotation.tpp | 1 + 5 files changed, 83 insertions(+), 22 deletions(-) create mode 100644 tests/timing/make_rotation_matrix_pairs.h create mode 100644 tests/timing/make_rotation_matrix_pairs.tpp diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 23a1593..1a5b558 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -11,20 +11,36 @@ macro(cml_add_timing _Name) target_link_libraries(${ExecName} cml cml_timing) endmacro() -set(cml_timing_INLINES uniform_random_rotation.tpp) -add_library(cml_timing STATIC - timing.h timing.cpp +# Support lib: +set(cml_timing_HEADERS + timing.h uniform_random_rotation.h + make_rotation_matrix_pairs.h +) + +set(cml_timing_SOURCES + timing.cpp +) + +set(cml_timing_INLINES + uniform_random_rotation.tpp + make_rotation_matrix_pairs.tpp +) + +add_library(cml_timing STATIC + ${cml_timing_HEADERS} ${cml_timing_INLINES} + ${cml_timing_SOURCES} ) target_include_directories(cml_timing - PUBLIC ${CMAKE_CURRENT_LIST_DIR}) + PUBLIC ${CMAKE_CURRENT_LIST_DIR}) set_source_files_properties(${cml_timing_INLINES} PROPERTIES - HEADER_FILE_ONLY 1) + HEADER_FILE_ONLY 1) if(${CMAKE_GENERATOR} MATCHES "Visual Studio") set(_inlines_group "Inlines\\") source_group("${_inlines_group}" FILES ${cml_timing_INLINES}) endif() +# Timing tests: cml_add_timing(cmatrix_mxm_timing - cmatrix.h cmatrix_mxm.h cmatrix_mxm.cpp cmatrix_mxm_timing.cpp) \ No newline at end of file + cmatrix.h cmatrix_mxm.h cmatrix_mxm.cpp cmatrix_mxm_timing.cpp) \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm_timing.cpp b/tests/timing/cmatrix_mxm_timing.cpp index 484c862..84a2feb 100644 --- a/tests/timing/cmatrix_mxm_timing.cpp +++ b/tests/timing/cmatrix_mxm_timing.cpp @@ -6,28 +6,19 @@ */ #include -#include -#include -#include "cmatrix_mxm.h" #include "timing.h" -#include "uniform_random_rotation.h" +#include "make_rotation_matrix_pairs.h" -using matrix_pair_t = std::pair; +#include "cmatrix_mxm.h" int main(int /*argc*/, char** /*argv*/) { - const int N = 1; - //const int N = 1000000; + // const int N = 1; + const int N = 1000000; /* Pre-generate N repeatable pairs of random 4x4 rotations: */ const auto prep_time_start = cml::testing::usec_time(); - std::mt19937 rng(0xdeadbeef); - std::uniform_real_distribution<> d(0., std::nextafter(0., 1.)); - std::vector rotations(N); - for(int i = 0; i < N; ++ i) { - cml::testing::uniform_random_rotation_4(rng, d, rotations[i].first); - cml::testing::uniform_random_rotation_4(rng, d, rotations[i].second); - } + const auto rotations = cml::testing::make_rotation_matrix_pairs(N); const auto prep_time_end = cml::testing::usec_time(); const auto prep_time = prep_time_end - prep_time_start; std::printf("prep time (%d pairs): %.5lf s\n", N, static_cast(prep_time)/1e6); @@ -36,9 +27,7 @@ int main(int /*argc*/, char** /*argv*/) matrix44d out; const auto mxm_time_start = cml::testing::usec_time(); for(int i = 0; i < N; ++ i) { - mxm_4x4(out, rotations[i].first, rotations[i].second); - } const auto mxm_time_end = cml::testing::usec_time(); const auto mxm_time = mxm_time_end - mxm_time_start; diff --git a/tests/timing/make_rotation_matrix_pairs.h b/tests/timing/make_rotation_matrix_pairs.h new file mode 100644 index 0000000..f94a9ef --- /dev/null +++ b/tests/timing/make_rotation_matrix_pairs.h @@ -0,0 +1,24 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +#include +#include + +namespace cml { +namespace testing { + +template +std::vector> +make_rotation_matrix_pairs(int N); + +}} // cml::testing + +#define _CML_TESTING_MAKE_ROTATION_MATRIX_PAIRS_TPP +#include "make_rotation_matrix_pairs.tpp" +#undef _CML_TESTING_MAKE_ROTATION_MATRIX_PAIRS_TPP diff --git a/tests/timing/make_rotation_matrix_pairs.tpp b/tests/timing/make_rotation_matrix_pairs.tpp new file mode 100644 index 0000000..3a3e7da --- /dev/null +++ b/tests/timing/make_rotation_matrix_pairs.tpp @@ -0,0 +1,31 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#ifndef _CML_TESTING_MAKE_ROTATION_MATRIX_PAIRS_TPP +#error "make_rotation_matrix_pairs.tpp not included correctly" +#endif + +#include "uniform_random_rotation.h" + +namespace cml { +namespace testing { + +template +std::vector> +make_rotation_matrix_pairs(int N) +{ + std::mt19937 rng(0xdeadbeef); + std::uniform_real_distribution<> d(0., std::nextafter(0., 1.)); + std::vector> rotations(N); + for(int i = 0; i < N; ++ i) { + cml::testing::uniform_random_rotation_4(rng, d, rotations[i].first); + cml::testing::uniform_random_rotation_4(rng, d, rotations[i].second); + } + return rotations; +} + +}} // cml::testing diff --git a/tests/timing/uniform_random_rotation.tpp b/tests/timing/uniform_random_rotation.tpp index c331010..6036276 100644 --- a/tests/timing/uniform_random_rotation.tpp +++ b/tests/timing/uniform_random_rotation.tpp @@ -9,6 +9,7 @@ #error "uniform_random_rotation.tpp not included correctly" #endif +#include #include #include #include From ec5766573073cdd2d75e91ec4b9261ede6a9c357 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 00:57:26 -0400 Subject: [PATCH 05/15] Add get(...) overloads with mutable return types --- cml/matrix/detail/get.h | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cml/matrix/detail/get.h b/cml/matrix/detail/get.h index 1cdc862..7ed7f78 100644 --- a/cml/matrix/detail/get.h +++ b/cml/matrix/detail/get.h @@ -17,25 +17,43 @@ namespace detail { /** Helper to return the passed-in value in response to a matrix index @c * (i,j). */ -template -inline auto get(const Other& v, int, int) -> const Other& +template inline auto +get(const Other& v, int, int) +-> typename std::enable_if::value, const Other&>::type { return v; } /** Helper to return element @c (i,j) of @c array. */ -template inline const Other& +template inline auto get(Other const (&array)[Rows][Cols], int i, int j) +-> const Other& +{ + return array[i][j]; +} + +/** Helper to return element @c (i,j) of @c array. */ +template inline auto +get(Other (&array)[Rows][Cols], int i, int j) +-> Other& { return array[i][j]; } /** Helper to return element @c (i,j) of @c sub. */ -template inline auto get( - const readable_matrix& sub, int i, int j - ) -> typename matrix_traits::immutable_value +template inline auto +get( + const readable_matrix& sub, int i, int j) + -> typename matrix_traits::immutable_value +{ + return sub.get(i, j); +} + +template inline auto +get(writable_matrix& sub, int i, int j) +-> typename matrix_traits::mutable_value { - return sub.get(i,j); + return sub.get(i, j); } } // namespace detail From 07f3863115343797e649b20e6b86bf8140e0c6bd Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 14:53:18 -0400 Subject: [PATCH 06/15] Use an array of output matrices to prevent elision of the mxm code --- tests/timing/CMakeLists.txt | 7 ++++++- tests/timing/cmatrix_mxm.h | 12 ------------ tests/timing/{cmatrix_mxm.cpp => cmatrix_mxm.inl} | 6 ++++-- tests/timing/cmatrix_mxm_timing.cpp | 11 ++++++----- 4 files changed, 16 insertions(+), 20 deletions(-) delete mode 100644 tests/timing/cmatrix_mxm.h rename tests/timing/{cmatrix_mxm.cpp => cmatrix_mxm.inl} (79%) diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 1a5b558..4b455ee 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -43,4 +43,9 @@ endif() # Timing tests: cml_add_timing(cmatrix_mxm_timing - cmatrix.h cmatrix_mxm.h cmatrix_mxm.cpp cmatrix_mxm_timing.cpp) \ No newline at end of file + cmatrix_mxm_timing.cpp + cmatrix.h cmatrix_mxm.inl + ) +#if(MSVC) +#set_source_files_properties(cmatrix_mxm_timing.cpp PROPERTIES COMPILE_OPTIONS "/Fa\$(IntDir)") +#endif() \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm.h b/tests/timing/cmatrix_mxm.h deleted file mode 100644 index 392ec5a..0000000 --- a/tests/timing/cmatrix_mxm.h +++ /dev/null @@ -1,12 +0,0 @@ -/* -*- C++ -*- ------------------------------------------------------------ - @@COPYRIGHT@@ - *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ - -#pragma once - -#include "cmatrix.h" - -void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B); \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm.cpp b/tests/timing/cmatrix_mxm.inl similarity index 79% rename from tests/timing/cmatrix_mxm.cpp rename to tests/timing/cmatrix_mxm.inl index 87ff439..1c340cf 100644 --- a/tests/timing/cmatrix_mxm.cpp +++ b/tests/timing/cmatrix_mxm.inl @@ -5,9 +5,11 @@ * @brief */ -#include "cmatrix_mxm.h" +#pragma once -void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) +#include "cmatrix.h" + +inline void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) { for(int row = 0; row < 4; ++row) { for(int col = 0; col < 4; ++col) { diff --git a/tests/timing/cmatrix_mxm_timing.cpp b/tests/timing/cmatrix_mxm_timing.cpp index 84a2feb..844af5b 100644 --- a/tests/timing/cmatrix_mxm_timing.cpp +++ b/tests/timing/cmatrix_mxm_timing.cpp @@ -9,12 +9,12 @@ #include "timing.h" #include "make_rotation_matrix_pairs.h" -#include "cmatrix_mxm.h" +#include "cmatrix_mxm.inl" int main(int /*argc*/, char** /*argv*/) { - // const int N = 1; - const int N = 1000000; + const int N = 1; + //const int N = 1000000; /* Pre-generate N repeatable pairs of random 4x4 rotations: */ const auto prep_time_start = cml::testing::usec_time(); @@ -24,10 +24,11 @@ int main(int /*argc*/, char** /*argv*/) std::printf("prep time (%d pairs): %.5lf s\n", N, static_cast(prep_time)/1e6); /* Time N multiplications: */ - matrix44d out; + using data_t = struct { matrix44d M; }; + std::vector out(N); const auto mxm_time_start = cml::testing::usec_time(); for(int i = 0; i < N; ++ i) { - mxm_4x4(out, rotations[i].first, rotations[i].second); + mxm_4x4(out[i].M, rotations[i].first, rotations[i].second); } const auto mxm_time_end = cml::testing::usec_time(); const auto mxm_time = mxm_time_end - mxm_time_start; From fd05cb9f1a11a52e07a682e9f405722eb860500a Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 14:54:32 -0400 Subject: [PATCH 07/15] Fix bugs in the random rotation matrix generator --- tests/timing/make_rotation_matrix_pairs.tpp | 2 +- tests/timing/uniform_random_rotation.tpp | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/timing/make_rotation_matrix_pairs.tpp b/tests/timing/make_rotation_matrix_pairs.tpp index 3a3e7da..4410c43 100644 --- a/tests/timing/make_rotation_matrix_pairs.tpp +++ b/tests/timing/make_rotation_matrix_pairs.tpp @@ -19,7 +19,7 @@ std::vector> make_rotation_matrix_pairs(int N) { std::mt19937 rng(0xdeadbeef); - std::uniform_real_distribution<> d(0., std::nextafter(0., 1.)); + std::uniform_real_distribution<> d(0., std::nextafter(1.,2.)); std::vector> rotations(N); for(int i = 0; i < N; ++ i) { cml::testing::uniform_random_rotation_4(rng, d, rotations[i].first); diff --git a/tests/timing/uniform_random_rotation.tpp b/tests/timing/uniform_random_rotation.tpp index 6036276..85a1fdb 100644 --- a/tests/timing/uniform_random_rotation.tpp +++ b/tests/timing/uniform_random_rotation.tpp @@ -29,17 +29,18 @@ void uniform_random_rotation_4( /* First, create a random unit quaternion (see K. Shoemake, Graphics Gems * III): */ - const auto two_pi = cml::constants::two_pi(); - const auto u1 = d(g), u2 = d(g), u3 = d(g); - const auto s1 = std::sqrt(1. - u1), s2 = std::sqrt(u1); - const auto t1 = two_pi * d(g), t2 = two_pi * d(g); + using scalar_type = value_type_trait_of_t; + const auto two_pi = cml::constants::two_pi(); + const auto u = d(g), u1 = d(g), u2 = d(g); + const auto s1 = std::sqrt(1. - u), s2 = std::sqrt(u); + const auto t1 = two_pi * u1, t2 = two_pi * u2; const auto w = std::cos(t2) * s2; const auto x = std::sin(t1) * s1; const auto y = std::cos(t1) * s1; const auto z = std::sin(t2) * s2; - const auto q = cml::quaterniond(w, x, y, z).normalize(); + const auto q = cml::quaterniond(x, y, z, w).normalize(); - cml::external44d M_out(&M[0][0]); + cml::external44d M_out(std::addressof(cml::detail::get(M, 0, 0))); cml::matrix_rotation_quaternion(M_out, q); } From 9047b83900f6e96ac463dfc7ab6482c1b7f6f6e8 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 16:04:35 -0400 Subject: [PATCH 08/15] Make the mxm implementation configurable via CMake --- tests/timing/CMakeLists.txt | 3 +++ tests/timing/cmatrix_mxm_timing.cpp | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 4b455ee..911ba93 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -46,6 +46,9 @@ cml_add_timing(cmatrix_mxm_timing cmatrix_mxm_timing.cpp cmatrix.h cmatrix_mxm.inl ) +target_compile_definitions(cmatrix_mxm_timing + PRIVATE CML_TIMING_MxM_INL="cmatrix_mxm.inl" + ) #if(MSVC) #set_source_files_properties(cmatrix_mxm_timing.cpp PROPERTIES COMPILE_OPTIONS "/Fa\$(IntDir)") #endif() \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm_timing.cpp b/tests/timing/cmatrix_mxm_timing.cpp index 844af5b..75bea4f 100644 --- a/tests/timing/cmatrix_mxm_timing.cpp +++ b/tests/timing/cmatrix_mxm_timing.cpp @@ -6,11 +6,13 @@ */ #include + +/* Need the matrix type and CML type system specializations: */ +#include CML_TIMING_MxM_INL + #include "timing.h" #include "make_rotation_matrix_pairs.h" -#include "cmatrix_mxm.inl" - int main(int /*argc*/, char** /*argv*/) { const int N = 1; From 106bc397bfb0961d0f115825f4af8a0cefac66e9 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 16:05:47 -0400 Subject: [PATCH 09/15] Add CML type system specializations for a 4x4 C-array (for the timing code only) --- tests/timing/cmatrix.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/timing/cmatrix.h b/tests/timing/cmatrix.h index aa27d72..4423e0f 100644 --- a/tests/timing/cmatrix.h +++ b/tests/timing/cmatrix.h @@ -7,4 +7,21 @@ #pragma once -using matrix44d = double[4][4]; \ No newline at end of file +#include + +using matrix44d = double[4][4]; + +/* Need some specializations for CML: */ +namespace cml { + +struct matrix44d_traits +{ + using value_type = double; +}; + +template<> struct traits_of +{ + using type = matrix44d_traits; +}; + +} // namespace cml \ No newline at end of file From 9cb9f03ad3481e8b11b900276de5f714cb30f145 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 16:06:08 -0400 Subject: [PATCH 10/15] Minor cleanup of common/traits.h --- cml/common/traits.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cml/common/traits.h b/cml/common/traits.h index 2bae744..e891e5d 100644 --- a/cml/common/traits.h +++ b/cml/common/traits.h @@ -25,18 +25,18 @@ template using traits_of_t = typename traits_of::type; /** Retrieve the value_type of @c T via an embedded typedef. */ template struct value_type_of { - typedef typename T::value_type type; + using type = typename T::value_type; }; -/** Convenience alias for value_typet_of. */ +/** Convenience alias for value_type_of. */ template using value_type_of_t = typename value_type_of::type; -/** Retrieve the value_type of @c T via traits. +/** Retrieve the value_type of @c T via traits. * * @note This applies to CML expression types, including scalars. */ template struct value_type_trait_of { - typedef typename traits_of::type::value_type type; + using type = typename traits_of_t::value_type; }; /** Convenience alias for value_type_trait_of. */ @@ -48,6 +48,3 @@ template } // namespace cml #endif - -// ------------------------------------------------------------------------- -// vim:ft=cpp:sw=2 From 8314e5cca00bb6936a4f604c3f9f64e2ae4813bc Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 16:16:41 -0400 Subject: [PATCH 11/15] Complete transition to configurable timing code (for 4x4, anyway), including ET-based mxm --- tests/timing/CMakeLists.txt | 17 ++++++++++++++--- tests/timing/expr_mxm.inl | 16 ++++++++++++++++ ...rix_mxm_timing.cpp => matrix_mxm_timing.cpp} | 4 ++-- 3 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 tests/timing/expr_mxm.inl rename tests/timing/{cmatrix_mxm_timing.cpp => matrix_mxm_timing.cpp} (96%) diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 911ba93..48950e1 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -41,14 +41,25 @@ if(${CMAKE_GENERATOR} MATCHES "Visual Studio") source_group("${_inlines_group}" FILES ${cml_timing_INLINES}) endif() -# Timing tests: +# C matrix timing: cml_add_timing(cmatrix_mxm_timing - cmatrix_mxm_timing.cpp + matrix_mxm_timing.cpp cmatrix.h cmatrix_mxm.inl ) target_compile_definitions(cmatrix_mxm_timing PRIVATE CML_TIMING_MxM_INL="cmatrix_mxm.inl" ) + +# Expression template timing: +cml_add_timing(expr_mxm_timing + matrix_mxm_timing.cpp + expr_mxm.inl + ) +target_compile_definitions(expr_mxm_timing + PRIVATE CML_TIMING_MxM_INL="expr_mxm.inl" + ) + + #if(MSVC) -#set_source_files_properties(cmatrix_mxm_timing.cpp PROPERTIES COMPILE_OPTIONS "/Fa\$(IntDir)") +#set_source_files_properties(matrix_mxm_timing.cpp PROPERTIES COMPILE_OPTIONS "/Fa\$(IntDir)") #endif() \ No newline at end of file diff --git a/tests/timing/expr_mxm.inl b/tests/timing/expr_mxm.inl new file mode 100644 index 0000000..58b31cb --- /dev/null +++ b/tests/timing/expr_mxm.inl @@ -0,0 +1,16 @@ +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +#include + +using matrix44d = cml::matrix44d; +inline void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) +{ + res = A*B; +} \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm_timing.cpp b/tests/timing/matrix_mxm_timing.cpp similarity index 96% rename from tests/timing/cmatrix_mxm_timing.cpp rename to tests/timing/matrix_mxm_timing.cpp index 75bea4f..fb18638 100644 --- a/tests/timing/cmatrix_mxm_timing.cpp +++ b/tests/timing/matrix_mxm_timing.cpp @@ -15,8 +15,8 @@ int main(int /*argc*/, char** /*argv*/) { - const int N = 1; - //const int N = 1000000; + // const int N = 1; + const int N = 1000000; /* Pre-generate N repeatable pairs of random 4x4 rotations: */ const auto prep_time_start = cml::testing::usec_time(); From 1be481eca91f0e307265f961c9810f630a144dfa Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 26 May 2021 16:49:12 -0400 Subject: [PATCH 12/15] Minor CMakeLists.txt cleanup to prepare for SIMD timing codes --- tests/timing/CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 48950e1..0757bd4 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -41,22 +41,24 @@ if(${CMAKE_GENERATOR} MATCHES "Visual Studio") source_group("${_inlines_group}" FILES ${cml_timing_INLINES}) endif() -# C matrix timing: +# Naive C-array mxm timing: cml_add_timing(cmatrix_mxm_timing matrix_mxm_timing.cpp cmatrix.h cmatrix_mxm.inl ) target_compile_definitions(cmatrix_mxm_timing - PRIVATE CML_TIMING_MxM_INL="cmatrix_mxm.inl" + PRIVATE + CML_TIMING_MxM_INL="cmatrix_mxm.inl" ) -# Expression template timing: +# Naive expression template mxm timing: cml_add_timing(expr_mxm_timing matrix_mxm_timing.cpp expr_mxm.inl ) target_compile_definitions(expr_mxm_timing - PRIVATE CML_TIMING_MxM_INL="expr_mxm.inl" + PRIVATE + CML_TIMING_MxM_INL="expr_mxm.inl" ) From c9c8218c5a7138c29769e458704bf7e44dd351ea Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 2 Jun 2021 13:53:29 -0400 Subject: [PATCH 13/15] Add SIMD skeleton with AVX and start of SSE4.2 --- tests/timing/CMakeLists.txt | 19 +++ tests/timing/cmatrix_simd_mxm.inl | 247 ++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 tests/timing/cmatrix_simd_mxm.inl diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 0757bd4..41e43a0 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -51,6 +51,25 @@ target_compile_definitions(cmatrix_mxm_timing CML_TIMING_MxM_INL="cmatrix_mxm.inl" ) +# Naive SIMD C-array mxm timing: +cml_add_timing(cmatrix_simd_mxm_timing + matrix_mxm_timing.cpp + cmatrix.h cmatrix_simd_mxm.inl + ) +target_compile_definitions(cmatrix_simd_mxm_timing + PRIVATE + CML_TIMING_MxM_INL="cmatrix_simd_mxm.inl" + ) +if(MSVC) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + target_compile_options(cmatrix_simd_mxm_timing + PRIVATE + "/arch:AVX" + ) + endif() +endif() + + # Naive expression template mxm timing: cml_add_timing(expr_mxm_timing matrix_mxm_timing.cpp diff --git a/tests/timing/cmatrix_simd_mxm.inl b/tests/timing/cmatrix_simd_mxm.inl new file mode 100644 index 0000000..1aa01bc --- /dev/null +++ b/tests/timing/cmatrix_simd_mxm.inl @@ -0,0 +1,247 @@ + +/* -*- C++ -*- ------------------------------------------------------------ + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ +/** @file + * @brief + */ + +#pragma once + +#include "cmatrix.h" + +#ifdef CML_DISABLE_SIMD +# define CML_SIMD_SCALAR +#else + +#ifdef _MSC_VER +#include + +/* Assume SSE4.2 if SSE2 is supported (minimum requirement): */ +#if defined(_M_X64) +# define __SSE4_2__ 1 +#elif defined(_M_IX86_FP) +# if _M_IX86_FP >= 2 +# define __SSE4_2__ 1 +# endif +#endif + +#if defined(__AVX__) +# define CML_SIMD_AVX +#elif defined(__SSE4_2__) +# define CML_SIMD_SSE4_2 +#else +# error "SSE4.2 or higher required" +#endif + +#else +#error "Unrecognized compiler" +#endif + +#endif + + +/* Basic AVX, double, row-major & col-basis, unaligned access: */ +#if defined(CML_SIMD_AVX) +inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +{ + /* Favor keeping B in registers: */ + __m256d B_row[4]; + for(int i = 0; i < 4; ++i) { + B_row[i] = _mm256_load_pd(reinterpret_cast(&B[i][0])); + } + for(int i = 0; i < 4; ++i) { + __m256d C_row = _mm256_mul_pd(B_row[0], _mm256_set1_pd(A[i][0])); + C_row = _mm256_add_pd(C_row, _mm256_mul_pd(B_row[1], _mm256_set1_pd(A[i][1]))); + C_row = _mm256_add_pd(C_row, _mm256_mul_pd(B_row[2], _mm256_set1_pd(A[i][2]))); + C_row = _mm256_add_pd(C_row, _mm256_mul_pd(B_row[3], _mm256_set1_pd(A[i][3]))); + _mm256_store_pd(&C[i][0], C_row); + } +} +#elif defined(CML_SIMD_SSE4_2) + +#if 0 + +/* Row major: */ +inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +{ + + + +#if 0 + C.zero(); + + __m128d A0[2], A1[2]; + __m128d B0[2], B1[2]; + __m128d C[2]; + + /* Load upper half of B into 4 registers (2 blocks of 2x2): */ + B0[0] = _mm_loadu_pd(&B[0][0]); B0[1] = _mm_loadu_pd(&B[0][2]); + B1[0] = _mm_loadu_pd(&B[1][0]); B1[1] = _mm_loadu_pd(&B[1][2]); + + /* Load a00..a11: */ + A0[0] = _mm_set1_pd(A[0][0]); A0[1] = _mm_set1_pd(A[0][1]); + A1[0] = _mm_set1_pd(A[1][0]); A0[1] = _mm_set1_pd(A[1][1]); + + /* Load and update c00..c11: */ + C[0] = _mm_loadu_pd(&C[0][0]); + C[0] = _mm_add_pd(C[0], _mm_mul_pd(A0[0], B0[0])); + C[0] = _mm_add_pd(C[0], _mm_mul_pd(A0[1], B1[0])); + _mm_store_pd(&C[0][0], C[0]); + + C[1] = _mm_loadu_pd(&C[1][0]); + C[1] = _mm_add_pd(C[0], _mm_mul_pd(A1[0], B0[0])); + C[1] = _mm_add_pd(C[1], _mm_mul_pd(A1[1], B1[0])); + _mm_store_pd(&C[1][0], C[1]); +#endif + + + + + __m128d C0j, C1j, A0j, A1j, Bk; + for(int ii = 0; ii < 4; ii += 2) { + for(int jj = 0; jj < 4; jj += 2) { + C0j = _mm_setzero_pd(); + C1j = _mm_setzero_pd(); + + for(int kk = 0; kk < 4; kk += 2) { + A0j = _mm_set1_pd(A[ii + 0][kk + 0]); + A1j = _mm_set1_pd(A[ii + 1][kk + 0]); + Bk = _mm_loadu_pd(reinterpret_cast(&B[kk+0])); + C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); + C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); + +#if 0 + const auto A00 = A[ii + 0][kk + 0]; + const auto A10 = A[ii + 1][kk + 0]; + const auto B00 = B[kk + 0][jj + 0]; + const auto B01 = B[kk + 0][jj + 1]; + + C00 += A00 * B00; + C01 += A00 * B01; + C10 += A10 * B00; + C11 += A10 * B01; +#endif + + A0j = _mm_set1_pd(A[ii + 0][kk + 1]); + A1j = _mm_set1_pd(A[ii + 1][kk + 1]); + Bk = _mm_loadu_pd(reinterpret_cast(&B[kk+1])); + C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); + C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); + +#if 0 + const auto A01 = A[ii + 0][kk + 1]; + const auto A11 = A[ii + 1][kk + 1]; + const auto B10 = B[kk + 1][jj + 0]; + const auto B11 = B[kk + 1][jj + 1]; + + C00 += A01 * B10; + C01 += A01 * B11; + C10 += A11 * B10; + C11 += A11 * B11; +#endif + } + + _mm_store_pd(&C[ii][0], C0j); + _mm_store_pd(&C[ii][1], C1j); + } + } +} + +#elif 1 + +/* Basic blocked 2x2 using SSE2: */ +inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +{ + __m128d C0j, C1j, A0j, A1j, Bk; + for(int ii = 0; ii < 4; ii += 2) { + for(int jj = 0; jj < 4; jj += 2) { + C0j = _mm_setzero_pd(); + C1j = _mm_setzero_pd(); + + for(int kk = 0; kk < 4; kk += 2) { + A0j = _mm_set1_pd(A[ii + 0][kk + 0]); + A1j = _mm_set1_pd(A[ii + 1][kk + 0]); + Bk = _mm_loadu_pd(reinterpret_cast(&B[kk+0])); + C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); + C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); + +#if 0 + const auto A00 = A[ii + 0][kk + 0]; + const auto A10 = A[ii + 1][kk + 0]; + const auto B00 = B[kk + 0][jj + 0]; + const auto B01 = B[kk + 0][jj + 1]; + + C00 += A00 * B00; + C01 += A00 * B01; + C10 += A10 * B00; + C11 += A10 * B01; +#endif + + A0j = _mm_set1_pd(A[ii + 0][kk + 1]); + A1j = _mm_set1_pd(A[ii + 1][kk + 1]); + Bk = _mm_loadu_pd(reinterpret_cast(&B[kk+1])); + C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); + C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); + +#if 0 + const auto A01 = A[ii + 0][kk + 1]; + const auto A11 = A[ii + 1][kk + 1]; + const auto B10 = B[kk + 1][jj + 0]; + const auto B11 = B[kk + 1][jj + 1]; + + C00 += A01 * B10; + C01 += A01 * B11; + C10 += A11 * B10; + C11 += A11 * B11; +#endif + } + + _mm_store_pd(&C[ii][0], C0j); + _mm_store_pd(&C[ii][1], C1j); + } + } +} +#elif 0 +/* Basic 2x2 blocked multiply: */ +inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +{ + for(int ii = 0; ii < 4; ii += 2) { + for(int jj = 0; jj < 4; jj += 2) { + + auto C00 = 0., C01 = 0., C10 = 0., C11 = 0.; + for(int kk = 0; kk < 4; kk += 2) { + const auto A00 = A[ii + 0][kk + 0]; + const auto A10 = A[ii + 1][kk + 0]; + const auto B00 = B[kk + 0][jj + 0]; + const auto B01 = B[kk + 0][jj + 1]; + + C00 += A00 * B00; + C01 += A00 * B01; + C10 += A10 * B00; + C11 += A10 * B01; + + + const auto A01 = A[ii + 0][kk + 1]; + const auto A11 = A[ii + 1][kk + 1]; + const auto B10 = B[kk + 1][jj + 0]; + const auto B11 = B[kk + 1][jj + 1]; + + C00 += A01 * B10; + C01 += A01 * B11; + C10 += A11 * B10; + C11 += A11 * B11; + } + + C[ii + 0][jj + 0] = C00; + C[ii + 0][jj + 1] = C01; + C[ii + 1][jj + 0] = C10; + C[ii + 1][jj + 1] = C11; + } + } +} +#endif + +#else +#error "No SIMD intrinsics defined?" +#endif From 86b8a7ceb8bd8b2431138a9956c8c786513f4be8 Mon Sep 17 00:00:00 2001 From: demianmnave Date: Wed, 2 Jun 2021 13:54:26 -0400 Subject: [PATCH 14/15] Add iostream output for a 4x4 C-style matrix, and make N a command line parameter to matrix_mxm_timing.cpp --- tests/timing/cmatrix.h | 14 +++++++++++++- tests/timing/cmatrix_mxm.inl | 4 ++-- tests/timing/matrix_mxm_timing.cpp | 29 ++++++++++++++++++++++++----- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/tests/timing/cmatrix.h b/tests/timing/cmatrix.h index 4423e0f..6312582 100644 --- a/tests/timing/cmatrix.h +++ b/tests/timing/cmatrix.h @@ -7,6 +7,7 @@ #pragma once +#include #include using matrix44d = double[4][4]; @@ -24,4 +25,15 @@ template<> struct traits_of using type = matrix44d_traits; }; -} // namespace cml \ No newline at end of file +} // namespace cml + +inline std::ostream& operator<<(std::ostream& os, const matrix44d& M) +{ + for(int i = 0; i < 4; ++i) { + os << "["; + for(int j = 0; j < 4; ++j) os << " " << M[i][j]; + os << " ]"; + if (i != 4-1) os << std::endl; + } + return os; +} \ No newline at end of file diff --git a/tests/timing/cmatrix_mxm.inl b/tests/timing/cmatrix_mxm.inl index 1c340cf..9a10f1e 100644 --- a/tests/timing/cmatrix_mxm.inl +++ b/tests/timing/cmatrix_mxm.inl @@ -9,13 +9,13 @@ #include "cmatrix.h" -inline void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) +inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) { for(int row = 0; row < 4; ++row) { for(int col = 0; col < 4; ++col) { double sum = A[row][0]*B[0][col]; for(int k = 1; k < 4; ++k) sum += A[row][k]*B[k][col]; - res[row][col] = sum; + C[row][col] = sum; } } } \ No newline at end of file diff --git a/tests/timing/matrix_mxm_timing.cpp b/tests/timing/matrix_mxm_timing.cpp index fb18638..f324085 100644 --- a/tests/timing/matrix_mxm_timing.cpp +++ b/tests/timing/matrix_mxm_timing.cpp @@ -13,17 +13,28 @@ #include "timing.h" #include "make_rotation_matrix_pairs.h" -int main(int /*argc*/, char** /*argv*/) +int main(int argc, const char** argv) { - // const int N = 1; - const int N = 1000000; + long N = LONG_MAX; + if(argc == 2) { + char* a_end = nullptr; + N = std::strtol(argv[1], &a_end, 10); + } else { + // N = 1; + N = 10000000; + } + + if(0 >= N || N == LONG_MAX) { + std::cerr << argv[0] << ": invalid sample count " << argv[1] << std::endl; + return 1; + } /* Pre-generate N repeatable pairs of random 4x4 rotations: */ const auto prep_time_start = cml::testing::usec_time(); const auto rotations = cml::testing::make_rotation_matrix_pairs(N); const auto prep_time_end = cml::testing::usec_time(); const auto prep_time = prep_time_end - prep_time_start; - std::printf("prep time (%d pairs): %.5lf s\n", N, static_cast(prep_time)/1e6); + std::printf("prep time (%d pairs): %.5lf s\n", N, static_cast(prep_time) / 1e6); /* Time N multiplications: */ using data_t = struct { matrix44d M; }; @@ -34,7 +45,15 @@ int main(int /*argc*/, char** /*argv*/) } const auto mxm_time_end = cml::testing::usec_time(); const auto mxm_time = mxm_time_end - mxm_time_start; - std::printf("mxm time (%d pairs): %.5lf s\n", N, static_cast(mxm_time)/1e6); + std::printf("mxm time (%d pairs): %.5lf s\n", N, static_cast(mxm_time) / 1e6); + + if(N == 1) for(int i = 0; i < N; ++ i) { + std::clog << rotations[i].first << std::endl; + std::clog << rotations[i].second << std::endl; + std::clog << out[i].M << std::endl; + cml::matrix44d A(rotations[i].first), B(rotations[i].second); + std::clog << A * B << std::endl; + } return 0; } \ No newline at end of file From 34b6e4f172b517649e483f923ca9784130a6d218 Mon Sep 17 00:00:00 2001 From: Demian Nave Date: Sun, 1 Dec 2024 01:50:41 -0500 Subject: [PATCH 15/15] Implement experimental SIMD 4x4 matrix products with AVX2 - The new functions are enabled only when compiling with MSVC and "/arch:AVX2". --- cml/common/size_tags.h | 4 + cml/matrix/detail/get.h | 24 +- cml/matrix/matrix_product.h | 46 +-- cml/matrix/matrix_product.tpp | 296 ++++++++++++++--- tests/CMakeLists.txt | 83 ++--- tests/matrix/matrix_matrix_product1.cpp | 173 ++++++++-- tests/timing/CMakeLists.txt | 86 ++--- tests/timing/cmatrix.h | 14 +- tests/timing/cmatrix_mxm.inl | 22 +- tests/timing/cmatrix_simd_mxm.inl | 349 +++++++++++++++++--- tests/timing/expr_mxm.inl | 17 +- tests/timing/expr_simd_mxm.inl | 19 ++ tests/timing/make_rotation_matrix_pairs.h | 15 +- tests/timing/make_rotation_matrix_pairs.tpp | 41 ++- tests/timing/matrix_mxm_timing.cpp | 56 ++-- tests/timing/timing.cpp | 58 ++-- tests/timing/timing.h | 16 +- tests/timing/uniform_random_rotation.h | 17 +- tests/timing/uniform_random_rotation.tpp | 29 +- 19 files changed, 1003 insertions(+), 362 deletions(-) create mode 100644 tests/timing/expr_simd_mxm.inl diff --git a/cml/common/size_tags.h b/cml/common/size_tags.h index cb533ee..7a1c07e 100644 --- a/cml/common/size_tags.h +++ b/cml/common/size_tags.h @@ -78,6 +78,10 @@ template struct is_fixed_size std::is_same, fixed_size_tag>::value; }; +/** Helper to detect fixed-size types. */ +template constexpr auto is_fixed_size_v + = is_fixed_size::value; + /** Wrapper for enable_if to detect types tagged with fixed_size_tag. */ template struct enable_if_fixed_size : std::enable_if::value, T> diff --git a/cml/matrix/detail/get.h b/cml/matrix/detail/get.h index 0b13ff0..09bec60 100644 --- a/cml/matrix/detail/get.h +++ b/cml/matrix/detail/get.h @@ -4,6 +4,7 @@ #pragma once +#include #include namespace cml::detail { @@ -11,7 +12,8 @@ namespace cml::detail { /** Helper to return the passed-in value in response to a matrix index @c * (i,j). */ -template +template::value>> inline auto get(const Other& v, int, int) -> const Other& { @@ -20,17 +22,16 @@ get(const Other& v, int, int) -> const Other& /** Helper to return element @c (i,j) of @c array. */ template -inline const Other& -get(Other const (&array)[Rows][Cols], int i, int j) --> const Other& +inline auto +get(Other const (&array)[Rows][Cols], int i, int j) -> const Other& { return array[i][j]; } /** Helper to return element @c (i,j) of @c array. */ -template inline auto -get(Other (&array)[Rows][Cols], int i, int j) --> Other& +template +inline auto +get(Other (&array)[Rows][Cols], int i, int j) -> Other& { return array[i][j]; } @@ -44,11 +45,12 @@ get(const readable_matrix& sub, int i, int j) -> return sub.get(i, j); } -template inline auto -get(writable_matrix& sub, int i, int j) --> typename matrix_traits::mutable_value +template +inline auto +get(writable_matrix& sub, int i, int j) -> + typename matrix_traits::mutable_value { return sub.get(i, j); } -} +} // namespace cml::detail diff --git a/cml/matrix/matrix_product.h b/cml/matrix/matrix_product.h index 167735a..9ee96fd 100644 --- a/cml/matrix/matrix_product.h +++ b/cml/matrix/matrix_product.h @@ -1,23 +1,23 @@ -/*------------------------------------------------------------------------- - @@COPYRIGHT@@ - *-----------------------------------------------------------------------*/ - -#pragma once - -#include -#include - -namespace cml { - -/** Multiply two matrices, and return the result as a temporary. */ -template* = nullptr, - enable_if_matrix_t* = nullptr> -auto operator*(Sub1&& sub1, Sub2&& sub2) - -> matrix_inner_product_promote_t, - actual_operand_type_of_t>; - -} // namespace cml - -#define __CML_MATRIX_MATRIX_PRODUCT_TPP -#include -#undef __CML_MATRIX_MATRIX_PRODUCT_TPP +/*------------------------------------------------------------------------- + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ + +#pragma once + +#include +#include +#include + +namespace cml { + +/** Multiply two matrices and return the result as a temporary. */ +template, + typename = enable_if_matrix_t> +auto operator*(Sub1&& sub1, Sub2&& sub2) -> matrix_inner_product_promote_t< + actual_type_of_t, actual_type_of_t>; + +} // namespace cml + +#define __CML_MATRIX_MATRIX_PRODUCT_TPP +#include +#undef __CML_MATRIX_MATRIX_PRODUCT_TPP diff --git a/cml/matrix/matrix_product.tpp b/cml/matrix/matrix_product.tpp index 0e49e91..2a0eb02 100644 --- a/cml/matrix/matrix_product.tpp +++ b/cml/matrix/matrix_product.tpp @@ -1,38 +1,258 @@ -/*------------------------------------------------------------------------- - @@COPYRIGHT@@ - *-----------------------------------------------------------------------*/ - -#ifndef __CML_MATRIX_MATRIX_PRODUCT_TPP -# error "matrix/matrix_product.tpp not included correctly" -#endif - -#include - -namespace cml { - -template*, - enable_if_matrix_t*> -inline auto -operator*(Sub1&& sub1, Sub2&& sub2) - -> matrix_inner_product_promote_t, - actual_operand_type_of_t> -{ - using result_type = matrix_inner_product_promote_t< - actual_operand_type_of_t, - actual_operand_type_of_t>; - - cml::check_same_inner_size(sub1, sub2); - - result_type M; - detail::resize(M, array_rows_of(sub1), array_cols_of(sub2)); - for(int i = 0; i < M.rows(); ++i) { - for(int j = 0; j < M.cols(); ++j) { - auto m = sub1(i, 0) * sub2(0, j); - for(int k = 1; k < sub1.cols(); ++k) m += sub1(i, k) * sub2(k, j); - M(i, j) = m; - } - } - return M; -} - -} // namespace cml \ No newline at end of file +/*------------------------------------------------------------------------- + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ + +#ifndef __CML_MATRIX_MATRIX_PRODUCT_TPP +# error "matrix/matrix_product.tpp not included correctly" +#endif + +#include + +namespace cml::detail { + +/* Default matrix product. */ +template +inline auto +matrix_product(const Sub1& sub1, + const Sub2& sub2) -> matrix_inner_product_promote_t, + actual_type_of_t> +{ + using result_type = matrix_inner_product_promote_t, + actual_type_of_t>; + cml::check_same_inner_size(sub1, sub2); + + result_type M; + detail::resize(M, array_rows_of(sub1), array_cols_of(sub2)); + for(int i = 0; i < M.rows(); ++i) { + for(int j = 0; j < M.cols(); ++j) { + auto m = sub1(i, 0) * sub2(0, j); + for(int k = 1; k < sub1.cols(); ++k) m += sub1(i, k) * sub2(k, j); + M(i, j) = m; + } + } + return M; +} + +} // namespace cml::detail + +#ifndef CML_DISABLE_SIMD +# ifdef _MSC_VER +# include +# if defined(__AVX2__) +# define CML_SIMD_AVX2 +# endif +# endif +#endif + +#ifdef CML_SIMD_AVX2 +# include + +namespace cml::detail { + +/* AVX2 4x4 float row-major matrix product. */ +template +inline auto +matrix_product( + // clang-format off + const matrix, Basis1, row_major>& sub1, + const matrix, Basis2, row_major>& sub2 + // clang-format on + ) -> matrix, basis_tag_promote_t, + row_major> +{ + using result_type = matrix, + basis_tag_promote_t, row_major>; + + const auto* A = sub1.data(); + const auto* B = sub2.data(); + + /* Cache sub2: */ + const __m128 B0 = _mm_loadu_ps(B + (0 << 2)); + const __m128 B1 = _mm_loadu_ps(B + (1 << 2)); + const __m128 B2 = _mm_loadu_ps(B + (2 << 2)); + const __m128 B3 = _mm_loadu_ps(B + (3 << 2)); + + __m128 row[4]; + for(uint32_t i = 0; i < 4; ++i) { + /* row[0..3] = A[i][0] * B[0][0..3] */ + row[i] = _mm_mul_ps(_mm_broadcast_ss(A + (i << 2) + 0), B0); + + /* row[0..3] += A[i][1] * B[1][0..3] */ + row[i] = _mm_fmadd_ps(_mm_broadcast_ss(A + (i << 2) + 1), B1, row[i]); + + /* row[0..3] += A[i][2] * B[2][0..3] */ + row[i] = _mm_fmadd_ps(_mm_broadcast_ss(A + (i << 2) + 2), B2, row[i]); + + /* row[0..3] += A[i][3] * B[3][0..3] */ + row[i] = _mm_fmadd_ps(_mm_broadcast_ss(A + (i << 2) + 3), B3, row[i]); + } + + /* result[i] = row[i] */ + result_type result; + auto* M = result.data(); + for(uint32_t i = 0; i < 4; ++i) { + _mm_storeu_ps(M + (i << 2), row[i]); + } + + return result; +} + +/* AVX2 4x4 float col-major matrix product. */ +template +inline auto +matrix_product( + // clang-format off + const matrix, Basis1, col_major>& sub1, + const matrix, Basis2, col_major>& sub2 + // clang-format on + ) -> matrix, basis_tag_promote_t, + col_major> +{ + using result_type = matrix, + basis_tag_promote_t, col_major>; + + const auto* A = sub1.data(); + const auto* B = sub2.data(); + + /* Cache sub1: */ + const __m128 A0 = _mm_loadu_ps(A + (0 << 2)); + const __m128 A1 = _mm_loadu_ps(A + (1 << 2)); + const __m128 A2 = _mm_loadu_ps(A + (2 << 2)); + const __m128 A3 = _mm_loadu_ps(A + (3 << 2)); + + __m128 col[4]; + for(uint32_t j = 0; j < 4; ++j) { + /* col[0..3] = A[0..3][j] * B[j][0] */ + col[j] = _mm_mul_ps(A0, _mm_broadcast_ss(B + (j << 2) + 0)); + + /* col[0..3] += A[0..3][j] * B[j][1] */ + col[j] = _mm_fmadd_ps(A1, _mm_broadcast_ss(B + (j << 2) + 1), col[j]); + + /* col[0..3] += A[0..3][j] * B[j][2] */ + col[j] = _mm_fmadd_ps(A2, _mm_broadcast_ss(B + (j << 2) + 2), col[j]); + + /* col[0..3] += A[0..3][j] * B[j][3] */ + col[j] = _mm_fmadd_ps(A3, _mm_broadcast_ss(B + (j << 2) + 3), col[j]); + } + + /* result[j] = col[j] */ + result_type result; + auto* M = result.data(); + for(uint32_t j = 0; j < 4; ++j) { + _mm_storeu_ps(M + (j << 2), col[j]); + } + + return result; +} + +/* AVX2 4x4 double row-major matrix product. */ +template +inline auto +matrix_product( + // clang-format off + const matrix, Basis1, row_major>& sub1, + const matrix, Basis2, row_major>& sub2 + // clang-format on + ) -> matrix, basis_tag_promote_t, + row_major> +{ + using result_type = matrix, + basis_tag_promote_t, row_major>; + + const auto* A = sub1.data(); + const auto* B = sub2.data(); + + /* Cache sub2: */ + const __m256d B0 = _mm256_loadu_pd(B + (0 << 2)); + const __m256d B1 = _mm256_loadu_pd(B + (1 << 2)); + const __m256d B2 = _mm256_loadu_pd(B + (2 << 2)); + const __m256d B3 = _mm256_loadu_pd(B + (3 << 2)); + + __m256d row[4]; + for(uint32_t i = 0; i < 4; ++i) { + /* row[0..3] = A[i][0] * B[0][0..3] */ + row[i] = _mm256_mul_pd(_mm256_broadcast_sd(A + (i << 2) + 0), B0); + + /* row[0..3] += A[i][1] * B[1][0..3] */ + row[i] = _mm256_fmadd_pd(_mm256_broadcast_sd(A + (i << 2) + 1), B1, row[i]); + + /* row[0..3] += A[i][2] * B[2][0..3] */ + row[i] = _mm256_fmadd_pd(_mm256_broadcast_sd(A + (i << 2) + 2), B2, row[i]); + + /* row[0..3] += A[i][3] * B[3][0..3] */ + row[i] = _mm256_fmadd_pd(_mm256_broadcast_sd(A + (i << 2) + 3), B3, row[i]); + } + + /* result[i] = row[i] */ + result_type result; + auto* M = result.data(); + for(uint32_t i = 0; i < 4; ++i) { + _mm256_storeu_pd(M + (i << 2), row[i]); + } + + return result; +} + +/* AVX2 4x4 double col-major matrix product. */ +template +inline auto +matrix_product( + // clang-format off + const matrix, Basis1, col_major>& sub1, + const matrix, Basis2, col_major>& sub2 + // clang-format on + ) -> matrix, basis_tag_promote_t, + col_major> +{ + using result_type = matrix, + basis_tag_promote_t, col_major>; + + const auto* A = sub1.data(); + const auto* B = sub2.data(); + + /* Cache sub1: */ + const __m256d A0 = _mm256_loadu_pd(A + (0 << 2)); + const __m256d A1 = _mm256_loadu_pd(A + (1 << 2)); + const __m256d A2 = _mm256_loadu_pd(A + (2 << 2)); + const __m256d A3 = _mm256_loadu_pd(A + (3 << 2)); + + __m256d col[4]; + for(uint32_t j = 0; j < 4; ++j) { + /* col[0..3] = A[0..3][j] * B[j][0] */ + col[j] = _mm256_mul_pd(A0, _mm256_broadcast_sd(B + (j << 2) + 0)); + + /* col[0..3] += A[0..3][j] * B[j][1] */ + col[j] = _mm256_fmadd_pd(A1, _mm256_broadcast_sd(B + (j << 2) + 1), col[j]); + + /* col[0..3] += A[0..3][j] * B[j][2] */ + col[j] = _mm256_fmadd_pd(A2, _mm256_broadcast_sd(B + (j << 2) + 2), col[j]); + + /* col[0..3] += A[0..3][j] * B[j][3] */ + col[j] = _mm256_fmadd_pd(A3, _mm256_broadcast_sd(B + (j << 2) + 3), col[j]); + } + + /* result[j] = col[j] */ + result_type result; + auto* M = result.data(); + for(uint32_t j = 0; j < 4; ++j) { + _mm256_storeu_pd(M + (j << 2), col[j]); + } + + return result; +} + +} // namespace cml::detail +#endif + +namespace cml { + +template +inline auto +operator*(Sub1&& sub1, Sub2&& sub2) -> matrix_inner_product_promote_t< + actual_type_of_t, actual_type_of_t> +{ + return detail::matrix_product(std::forward(sub1), + std::forward(sub2)); +} + +} // namespace cml diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d55f6da..165a39b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,31 +2,17 @@ # @@COPYRIGHT@@ # -------------------------------------------------------------------------- -# Function to add a single-file test to the build, using ${_name}.cpp as the -# test source. -function(cml_add_test _name) - # Define the executable name: - set(ExecName "${_name}") - - # Define the test name: - if(DEFINED CML_TEST_GROUP) - set(TestName "CML:${CML_TEST_GROUP}:${_name}") - else() - message(FATAL_ERROR "CML_TEST_GROUP must be defined") - endif() - - # Setup the build target: - add_executable(${ExecName} ${_name}.cpp) - +function(cml_default_compile_options _name) # MSVC-only C++ compiler options: if(MSVC) + set(_cxx_is_msvc $) + set(_msvc_common_options /permissive- /EHsc /W4 # Enable strict warnings. ) - set(_cxx_is_msvc $) set(_msvc_compile_options /Zc:inline /Zc:strictStrings @@ -36,41 +22,62 @@ function(cml_add_test _name) /diagnostics:caret /WL /MP + /arch:AVX2 ) - set(_msvc_link_options - /NOIMPLIB - /NOEXP - ) - - target_compile_options(${ExecName} + target_compile_options(${_name} PRIVATE - ${_msvc_common_options} - $<${_cxx_is_msvc}:${_msvc_compile_options}> + ${_msvc_common_options} + $<${_cxx_is_msvc}:${_msvc_compile_options}> ) - target_compile_features(${ExecName} + target_compile_features(${_name} PRIVATE cxx_std_17 ) + endif() +endfunction() + +# Function to add a single-file executable to the build, using ${_name}.cpp as +# the test source. +function(cml_add_executable _name) + add_executable(${_name} ${ARGN}) + cml_default_compile_options(${_name}) + + if(MSVC) + set(_cxx_is_msvc $) + + set(_msvc_link_options + /NOIMPLIB + /NOEXP + ) - target_link_options(${ExecName} + target_link_options(${_name} PRIVATE $<${_cxx_is_msvc}:${_msvc_link_options}> ) + endif() - set_target_properties(${ExecName} PROPERTIES - FOLDER "cml-tests/${CML_TEST_GROUP}" - ) + get_target_property(_path ${_name} SOURCE_DIR) + get_target_property(_sources ${_name} SOURCES) + source_group(TREE "${_path}" FILES ${_sources}) +endfunction() - get_target_property(_path ${_name} SOURCE_DIR) - get_target_property(_sources ${_name} SOURCES) - source_group(TREE "${_path}" FILES ${_sources}) +# Function to add a single-file test to the build, using ${_name}.cpp as the +# test source. +function(cml_add_test _name) + if(DEFINED CML_TEST_GROUP) + set(TestName "CML:${CML_TEST_GROUP}:${_name}") + else() + message(FATAL_ERROR "CML_TEST_GROUP must be defined") endif() - target_link_libraries(${ExecName} cml cml_test_main) + cml_add_executable(${_name} ${_name}.cpp ${ARGN}) - # Setup the test: - add_test(NAME ${TestName} COMMAND ${ExecName}) + add_test(NAME ${TestName} COMMAND ${_name}) + set_target_properties(${_name} PROPERTIES + FOLDER "cml-tests/${CML_TEST_GROUP}" + ) + target_link_libraries(${_name} cml cml_test_main) endfunction() add_subdirectory(main) @@ -82,6 +89,6 @@ add_subdirectory(quaternion) add_subdirectory(mathlib) add_subdirectory(util) -#if(CML_BUILD_TIMING) - add_subdirectory(timing) +#if(BUILD_TIMING) +add_subdirectory(timing) #endif() \ No newline at end of file diff --git a/tests/matrix/matrix_matrix_product1.cpp b/tests/matrix/matrix_matrix_product1.cpp index bd75c0b..04ac983 100644 --- a/tests/matrix/matrix_matrix_product1.cpp +++ b/tests/matrix/matrix_matrix_product1.cpp @@ -2,6 +2,8 @@ @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ +#include + // Make sure the main header compiles cleanly: #include @@ -9,11 +11,14 @@ #include #include #include +#include + +#include /* Testing headers: */ #include "catch_runner.h" -CATCH_TEST_CASE("fixed, product1") +CATCH_TEST_CASE("fixed-product1") { cml::matrix22d M1(1., 2., 3., 4.); cml::matrix22d M2(5., 6., 7., 8.); @@ -28,7 +33,7 @@ CATCH_TEST_CASE("fixed, product1") CATCH_CHECK(M(1, 1) == 50.); } -CATCH_TEST_CASE("fixed, product2") +CATCH_TEST_CASE("fixed-product2") { cml::matrix> M1(1., 1., 2., 2., 3., 3.); cml::matrix23d M2(1., 2., 3., 1., 2., 3.); @@ -48,7 +53,127 @@ CATCH_TEST_CASE("fixed, product2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("fixed external, product1") +CATCH_TEST_CASE("fixed-product3") +{ + // clang-format off + cml::matrix44d M1( + 1., 1., 1., 1., + 2., 1., 1., 1., + 3., 3., 1., 1., + 4., 4., 4., 1. + ); + + cml::matrix44d M2( + 1., 4., 4., 4., + 1., 1., 3., 3., + 1., 1., 1., 2., + 1., 1., 1., 1. + ); + + cml::matrix44d X( + 4., 7., 9., 10., + 5., 11., 13., 14., + 8., 17., 23., 24., + 13., 25., 33., 37. + ); + // clang-format on + + const auto M = M1 * M2; + CATCH_REQUIRE((std::is_same::value)); + CATCH_CHECK(M == X); +} + +CATCH_TEST_CASE("fixed-product4") +{ + // clang-format off + cml::matrix44d_c M1( + 1., 1., 1., 1., + 2., 1., 1., 1., + 3., 3., 1., 1., + 4., 4., 4., 1. + ); + + cml::matrix44d_c M2( + 1., 4., 4., 4., + 1., 1., 3., 3., + 1., 1., 1., 2., + 1., 1., 1., 1. + ); + + cml::matrix44d_c X( + 4., 7., 9., 10., + 5., 11., 13., 14., + 8., 17., 23., 24., + 13., 25., 33., 37. + ); + // clang-format on + + const auto M = M1 * M2; + CATCH_REQUIRE((std::is_same::value)); + CATCH_CHECK(M == X); +} + +CATCH_TEST_CASE("fixed-product5") +{ + // clang-format off + cml::matrix44f M1( + 1.f, 1.f, 1.f, 1.f, + 2.f, 1.f, 1.f, 1.f, + 3.f, 3.f, 1.f, 1.f, + 4.f, 4.f, 4.f, 1.f + ); + + cml::matrix44f M2( + 1.f, 4.f, 4.f, 4.f, + 1.f, 1.f, 3.f, 3.f, + 1.f, 1.f, 1.f, 2.f, + 1.f, 1.f, 1.f, 1.f + ); + + cml::matrix44f X( + 4.f, 7.f, 9.f, 10.f, + 5.f, 11.f, 13.f, 14.f, + 8.f, 17.f, 23.f, 24.f, + 13.f, 25.f, 33.f, 37.f + ); + // clang-format on + + const auto M = M1 * M2; + CATCH_REQUIRE((std::is_same::value)); + CATCH_CHECK(M == X); +} + +CATCH_TEST_CASE("fixed-product6") +{ + // clang-format off + cml::matrix44f_c M1( + 1.f, 1.f, 1.f, 1.f, + 2.f, 1.f, 1.f, 1.f, + 3.f, 3.f, 1.f, 1.f, + 4.f, 4.f, 4.f, 1.f + ); + + cml::matrix44f_c M2( + 1.f, 4.f, 4.f, 4.f, + 1.f, 1.f, 3.f, 3.f, + 1.f, 1.f, 1.f, 2.f, + 1.f, 1.f, 1.f, 1.f + ); + + cml::matrix44f_c X( + 4.f, 7.f, 9.f, 10.f, + 5.f, 11.f, 13.f, 14.f, + 8.f, 17.f, 23.f, 24.f, + 13.f, 25.f, 33.f, 37.f + ); + // clang-format on + + const auto M = M1 * M2; + CATCH_REQUIRE((std::is_same::value)); + CATCH_CHECK(M == X); +} + +CATCH_TEST_CASE("fixed external-product1") { double aM1[] = {1., 2., 3., 4.}; cml::external22d M1(aM1); @@ -66,7 +191,7 @@ CATCH_TEST_CASE("fixed external, product1") CATCH_CHECK(M(1, 1) == 50.); } -CATCH_TEST_CASE("fixed external, product2") +CATCH_TEST_CASE("fixed external-product2") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(aM1); @@ -89,7 +214,7 @@ CATCH_TEST_CASE("fixed external, product2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("dynamic external, product1") +CATCH_TEST_CASE("dynamic external-product1") { double aM1[] = {1., 2., 3., 4.}; cml::externalmnd M1(2, 2, aM1); @@ -107,7 +232,7 @@ CATCH_TEST_CASE("dynamic external, product1") CATCH_CHECK(M(1, 1) == 50.); } -CATCH_TEST_CASE("dynamic external, product2") +CATCH_TEST_CASE("dynamic external-product2") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::externalmnd M1(3, 2, aM1); @@ -130,7 +255,7 @@ CATCH_TEST_CASE("dynamic external, product2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("dynamic external, size_checking1") +CATCH_TEST_CASE("dynamic external-size_checking1") { double aM1[4], aM2[6]; CATCH_REQUIRE_THROWS_AS( @@ -138,7 +263,7 @@ CATCH_TEST_CASE("dynamic external, size_checking1") cml::incompatible_matrix_inner_size_error); } -CATCH_TEST_CASE("dynamic, product1") +CATCH_TEST_CASE("dynamic-product1") { cml::matrixd M1(2, 2, 1., 2., 3., 4.); cml::matrixd M2(2, 2, 5., 6., 7., 8.); @@ -153,7 +278,7 @@ CATCH_TEST_CASE("dynamic, product1") CATCH_CHECK(M(1, 1) == 50.); } -CATCH_TEST_CASE("dynamic, product2") +CATCH_TEST_CASE("dynamic-product2") { cml::matrixd M1(3, 2, 1., 1., 2., 2., 3., 3.); cml::matrixd M2(2, 3, 1., 2., 3., 1., 2., 3.); @@ -173,13 +298,13 @@ CATCH_TEST_CASE("dynamic, product2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("dynamic, size_checking1") +CATCH_TEST_CASE("dynamic-size_checking1") { CATCH_REQUIRE_THROWS_AS((cml::matrixd(2, 2) * cml::matrixd(3, 2)), cml::incompatible_matrix_inner_size_error); } -CATCH_TEST_CASE("mixed fixed, dynamic1") +CATCH_TEST_CASE("mixed fixed-dynamic1") { cml::matrix> M1(1., 1., 2., 2., 3., 3.); cml::matrixd M2(2, 3, 1., 2., 3., 1., 2., 3.); @@ -199,7 +324,7 @@ CATCH_TEST_CASE("mixed fixed, dynamic1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed fixed, external1") +CATCH_TEST_CASE("mixed fixed-external1") { cml::matrix> M1(1., 1., 2., 2., 3., 3.); @@ -221,7 +346,7 @@ CATCH_TEST_CASE("mixed fixed, external1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed fixed, external2") +CATCH_TEST_CASE("mixed fixed-external2") { cml::matrix> M1(1., 1., 2., 2., 3., 3.); @@ -243,7 +368,7 @@ CATCH_TEST_CASE("mixed fixed, external2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic, fixed1") +CATCH_TEST_CASE("mixed dynamic-fixed1") { cml::matrixd M1(3, 2, 1., 1., 2., 2., 3., 3.); cml::matrix23d M2(1., 2., 3., 1., 2., 3.); @@ -263,7 +388,7 @@ CATCH_TEST_CASE("mixed dynamic, fixed1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic, external1") +CATCH_TEST_CASE("mixed dynamic-external1") { cml::matrixd M1(3, 2, 1., 1., 2., 2., 3., 3.); @@ -285,7 +410,7 @@ CATCH_TEST_CASE("mixed dynamic, external1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic, external2") +CATCH_TEST_CASE("mixed dynamic-external2") { cml::matrixd M1(3, 2, 1., 1., 2., 2., 3., 3.); @@ -307,7 +432,7 @@ CATCH_TEST_CASE("mixed dynamic, external2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed fixed external, fixed1") +CATCH_TEST_CASE("mixed fixed external-fixed1") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(aM1); @@ -329,7 +454,7 @@ CATCH_TEST_CASE("mixed fixed external, fixed1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed fixed external, dynamic1") +CATCH_TEST_CASE("mixed fixed external-dynamic1") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(aM1); @@ -351,7 +476,7 @@ CATCH_TEST_CASE("mixed fixed external, dynamic1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed fixed external, external1") +CATCH_TEST_CASE("mixed fixed external-external1") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(aM1); @@ -374,7 +499,7 @@ CATCH_TEST_CASE("mixed fixed external, external1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed fixed external, external2") +CATCH_TEST_CASE("mixed fixed external-external2") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(aM1); @@ -397,7 +522,7 @@ CATCH_TEST_CASE("mixed fixed external, external2") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic external, fixed1") +CATCH_TEST_CASE("mixed dynamic external-fixed1") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(3, 2, aM1); @@ -419,7 +544,7 @@ CATCH_TEST_CASE("mixed dynamic external, fixed1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic external, dynamic1") +CATCH_TEST_CASE("mixed dynamic external-dynamic1") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(3, 2, aM1); @@ -441,7 +566,7 @@ CATCH_TEST_CASE("mixed dynamic external, dynamic1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic external, external1") +CATCH_TEST_CASE("mixed dynamic external-external1") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(3, 2, aM1); @@ -464,7 +589,7 @@ CATCH_TEST_CASE("mixed dynamic external, external1") CATCH_CHECK(M(2, 2) == 18.); } -CATCH_TEST_CASE("mixed dynamic external, external2") +CATCH_TEST_CASE("mixed dynamic external-external2") { double aM1[] = {1., 1., 2., 2., 3., 3.}; cml::matrix> M1(3, 2, aM1); diff --git a/tests/timing/CMakeLists.txt b/tests/timing/CMakeLists.txt index 41e43a0..b687c63 100644 --- a/tests/timing/CMakeLists.txt +++ b/tests/timing/CMakeLists.txt @@ -1,13 +1,13 @@ -# -*- cmake -*- ----------------------------------------------------------- +#*------------------------------------------------------------------------- # @@COPYRIGHT@@ #*------------------------------------------------------------------------- project(CML_Testing_Timing) -macro(cml_add_timing _Name) - set(ExecName "${_Name}") - add_executable(${ExecName} ${ARGN}) - set_target_properties(${ExecName} PROPERTIES FOLDER "CML-Timing") +macro(cml_add_timing _name) + set(ExecName "${_name}") + cml_add_executable(${_name} ${ARGN}) + set_target_properties(${ExecName} PROPERTIES FOLDER "cml-tests/timing") target_link_libraries(${ExecName} cml cml_timing) endmacro() @@ -31,56 +31,66 @@ add_library(cml_timing STATIC ${cml_timing_HEADERS} ${cml_timing_INLINES} ${cml_timing_SOURCES} - ) +) +cml_default_compile_options(cml_timing) target_include_directories(cml_timing PUBLIC ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(cml_timing PUBLIC cml) set_source_files_properties(${cml_timing_INLINES} PROPERTIES HEADER_FILE_ONLY 1) -if(${CMAKE_GENERATOR} MATCHES "Visual Studio") - set(_inlines_group "Inlines\\") - source_group("${_inlines_group}" FILES ${cml_timing_INLINES}) -endif() +set_target_properties(cml_timing PROPERTIES FOLDER "cml-tests/timing") +get_target_property(_path cml_timing SOURCE_DIR) +get_target_property(_sources cml_timing SOURCES) +source_group(TREE "${_path}" FILES ${_sources}) # Naive C-array mxm timing: cml_add_timing(cmatrix_mxm_timing matrix_mxm_timing.cpp - cmatrix.h cmatrix_mxm.inl - ) + cmatrix.h cmatrix_mxm.inl +) target_compile_definitions(cmatrix_mxm_timing - PRIVATE + PRIVATE CML_TIMING_MxM_INL="cmatrix_mxm.inl" - ) +) + +# Naive expression template mxm timing: +cml_add_timing(expr_mxm_timing + matrix_mxm_timing.cpp + expr_mxm.inl +) +target_compile_definitions(expr_mxm_timing + PRIVATE + CML_TIMING_MxM_INL="expr_mxm.inl" +) -# Naive SIMD C-array mxm timing: +# SIMD C-array mxm timing: cml_add_timing(cmatrix_simd_mxm_timing matrix_mxm_timing.cpp - cmatrix.h cmatrix_simd_mxm.inl - ) + cmatrix.h cmatrix_simd_mxm.inl +) target_compile_definitions(cmatrix_simd_mxm_timing - PRIVATE + PRIVATE CML_TIMING_MxM_INL="cmatrix_simd_mxm.inl" - ) +) if(MSVC) - if(CMAKE_SIZEOF_VOID_P EQUAL 8) - target_compile_options(cmatrix_simd_mxm_timing - PRIVATE - "/arch:AVX" - ) - endif() + target_compile_options(cmatrix_simd_mxm_timing + PRIVATE + "/arch:AVX2" + ) endif() - -# Naive expression template mxm timing: -cml_add_timing(expr_mxm_timing +# SIMD expression mxm timing: +cml_add_timing(expr_simd_mxm_timing matrix_mxm_timing.cpp - expr_mxm.inl + expr_simd_mxm.inl +) +target_compile_definitions(expr_simd_mxm_timing + PRIVATE + CML_TIMING_MxM_INL="expr_simd_mxm.inl" +) +if(MSVC) + target_compile_options(expr_simd_mxm_timing + PRIVATE + "/arch:AVX2" ) -target_compile_definitions(expr_mxm_timing - PRIVATE - CML_TIMING_MxM_INL="expr_mxm.inl" - ) - - -#if(MSVC) -#set_source_files_properties(matrix_mxm_timing.cpp PROPERTIES COMPILE_OPTIONS "/Fa\$(IntDir)") -#endif() \ No newline at end of file +endif() diff --git a/tests/timing/cmatrix.h b/tests/timing/cmatrix.h index 6312582..5caef13 100644 --- a/tests/timing/cmatrix.h +++ b/tests/timing/cmatrix.h @@ -1,9 +1,6 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once @@ -25,15 +22,16 @@ template<> struct traits_of using type = matrix44d_traits; }; -} // namespace cml +} // namespace cml -inline std::ostream& operator<<(std::ostream& os, const matrix44d& M) +inline std::ostream& +operator<<(std::ostream& os, const matrix44d& M) { for(int i = 0; i < 4; ++i) { os << "["; for(int j = 0; j < 4; ++j) os << " " << M[i][j]; os << " ]"; - if (i != 4-1) os << std::endl; + if(i != 4 - 1) os << std::endl; } return os; -} \ No newline at end of file +} diff --git a/tests/timing/cmatrix_mxm.inl b/tests/timing/cmatrix_mxm.inl index 9a10f1e..1f64677 100644 --- a/tests/timing/cmatrix_mxm.inl +++ b/tests/timing/cmatrix_mxm.inl @@ -1,21 +1,19 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once #include "cmatrix.h" -inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +inline void +mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) { - for(int row = 0; row < 4; ++row) { - for(int col = 0; col < 4; ++col) { - double sum = A[row][0]*B[0][col]; - for(int k = 1; k < 4; ++k) sum += A[row][k]*B[k][col]; - C[row][col] = sum; - } + for(int row = 0; row < 4; ++row) { + for(int col = 0; col < 4; ++col) { + auto sum = 0.; + for(int k = 0; k < 4; ++k) sum += A[row][k] * B[k][col]; + C[row][col] = sum; } -} \ No newline at end of file + } +} diff --git a/tests/timing/cmatrix_simd_mxm.inl b/tests/timing/cmatrix_simd_mxm.inl index 1aa01bc..cc39941 100644 --- a/tests/timing/cmatrix_simd_mxm.inl +++ b/tests/timing/cmatrix_simd_mxm.inl @@ -1,74 +1,316 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once -#include "cmatrix.h" - #ifdef CML_DISABLE_SIMD -# define CML_SIMD_SCALAR +# define CML_SIMD_SCALAR #else -#ifdef _MSC_VER -#include +# ifdef _MSC_VER +# include /* Assume SSE4.2 if SSE2 is supported (minimum requirement): */ -#if defined(_M_X64) -# define __SSE4_2__ 1 -#elif defined(_M_IX86_FP) -# if _M_IX86_FP >= 2 -# define __SSE4_2__ 1 -# endif -#endif +# if defined(_M_X64) +# define __SSE4_2__ 1 +# elif defined(_M_IX86_FP) +# if _M_IX86_FP >= 2 +# define __SSE4_2__ 1 +# endif +# endif + +# if defined(__AVX2__) +# define CML_SIMD_AVX2 +# elif defined(__SSE4_2__) +# define CML_SIMD_SSE4_2 +# else +# error "SSE4.2 or higher required" +# endif + +# else +# error "Unrecognized compiler" +# endif -#if defined(__AVX__) -# define CML_SIMD_AVX -#elif defined(__SSE4_2__) -# define CML_SIMD_SSE4_2 -#else -# error "SSE4.2 or higher required" #endif -#else -#error "Unrecognized compiler" -#endif +#include +#include + +#include "cmatrix.h" + +#if 0 +struct matrix44d +{ + double& operator()(int i, int j) { return d[i][j]; } + + union + { + double d[4][4]; + __m256d m[4]; + }; +}; + +/* Need some specializations for CML: */ +namespace cml { + +struct matrix44d_traits +{ + using value_type = double; +}; + +template<> struct traits_of +{ + using type = matrix44d_traits; +}; + +namespace detail { + +inline double& +get(::matrix44d& m, int i, int j) +{ + return m.d[i][j]; +} +inline const double& +get(const ::matrix44d& m, int i, int j) +{ + return m.d[i][j]; +} + +} // namespace detail +} // namespace cml + +inline std::ostream& +operator<<(std::ostream& os, const matrix44d& M) +{ + for(int i = 0; i < 4; ++i) { + os << "["; + for(int j = 0; j < 4; ++j) os << " " << M.d[i][j]; + os << " ]"; + if(i != 4 - 1) os << std::endl; + } + return os; +} #endif +// References: +// https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/intrinsics.html -/* Basic AVX, double, row-major & col-basis, unaligned access: */ -#if defined(CML_SIMD_AVX) -inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +/* Basic AVX2, double, row-major & col-basis: */ +#if defined(CML_SIMD_AVX2) +inline void +mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) { +# if 0 + const __m256d B0 = _mm256_loadu_pd(&B[0][0]); + const __m256d B1 = _mm256_loadu_pd(&B[1][0]); + const __m256d B2 = _mm256_loadu_pd(&B[2][0]); + const __m256d B3 = _mm256_loadu_pd(&B[3][0]); + + __m256d C0 = _mm256_mul_pd(_mm256_broadcast_sd(&A[0][0]), B0); + C0 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[0][1]), B1, C0); + C0 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[0][2]), B2, C0); + C0 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[0][3]), B3, C0); + + __m256d C1 = _mm256_mul_pd(_mm256_broadcast_sd(&A[1][0]), B0); + C1 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[1][1]), B1, C1); + C1 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[1][2]), B2, C1); + C1 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[1][3]), B3, C1); + + __m256d C2 = _mm256_mul_pd(_mm256_broadcast_sd(&A[2][0]), B0); + C2 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[2][1]), B1, C2); + C2 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[2][2]), B2, C2); + C2 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[2][3]), B3, C2); + + __m256d C3 = _mm256_mul_pd(_mm256_broadcast_sd(&A[3][0]), B0); + C3 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[3][1]), B1, C3); + C3 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[3][2]), B2, C3); + C3 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[3][3]), B3, C3); + + _mm256_storeu_pd(&C[0][0], C0); + _mm256_storeu_pd(&C[1][0], C1); + _mm256_storeu_pd(&C[2][0], C2); + _mm256_storeu_pd(&C[3][0], C3); +# endif + +# if 1 + const __m256d B0 = _mm256_loadu_pd(&B[0][0]); + const __m256d B1 = _mm256_loadu_pd(&B[1][0]); + const __m256d B2 = _mm256_loadu_pd(&B[2][0]); + const __m256d B3 = _mm256_loadu_pd(&B[3][0]); + + __m256d C_row[4]; + for(int i = 0; i < 4; ++i) { + /* row[0..3] = A[i][0] * B[0][0..3] */ + C_row[i] = _mm256_mul_pd(_mm256_broadcast_sd(&A[i][0]), B0); + + /* row[0..3] += A[i][1] * B[1][0..3] */ + C_row[i] = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[i][1]), B1, C_row[i]); + + /* row[0..3] += A[i][2] * B[2][0..3] */ + C_row[i] = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[i][2]), B2, C_row[i]); + + /* row[0..3] += A[i][3] * B[3][0..3] */ + C_row[i] = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[i][3]), B3, C_row[i]); + } + + for(int i = 0; i < 4; ++i) { + /* C[i][0...3] = row[0..3] */ + _mm256_storeu_pd(&C[i][0], C_row[i]); + } +# endif + +# if 0 + /* Favor keeping B in registers, use AVX2 FMA: */ + __m256d B_row[4]; + for(int i = 0; i < 4; ++i) { + B_row[i] = _mm256_loadu_pd(&B[i][0]); + } + for(int i = 0; i < 4; ++i) { + /* row[0..3] = A[i][0] * B[0][0..3] */ + __m256d C_row = _mm256_mul_pd(B_row[0], _mm256_broadcast_sd(&A[i][0])); + + /* row[0..3] += A[i][1] * B[1][0..3] */ + C_row = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[i][1]), B_row[1], C_row); + + /* row[0..3] += A[i][2] * B[2][0..3] */ + C_row = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[i][2]), B_row[2], C_row); + + /* row[0..3] += A[i][3] * B[3][0..3] */ + C_row = _mm256_fmadd_pd(_mm256_broadcast_sd(&A[i][3]), B_row[3], C_row); + + /* C[i][0...3] = row[0..3] */ + _mm256_storeu_pd(&C[i][0], C_row); + } +# endif + +# if 0 /* Favor keeping B in registers: */ __m256d B_row[4]; for(int i = 0; i < 4; ++i) { B_row[i] = _mm256_load_pd(reinterpret_cast(&B[i][0])); } for(int i = 0; i < 4; ++i) { - __m256d C_row = _mm256_mul_pd(B_row[0], _mm256_set1_pd(A[i][0])); - C_row = _mm256_add_pd(C_row, _mm256_mul_pd(B_row[1], _mm256_set1_pd(A[i][1]))); - C_row = _mm256_add_pd(C_row, _mm256_mul_pd(B_row[2], _mm256_set1_pd(A[i][2]))); - C_row = _mm256_add_pd(C_row, _mm256_mul_pd(B_row[3], _mm256_set1_pd(A[i][3]))); + __m256d C_row = _mm256_mul_pd(B_row[0], _mm256_set1_pd(A[i][0])); + C_row = + _mm256_add_pd(C_row, _mm256_mul_pd(B_row[1], _mm256_set1_pd(A[i][1]))); + C_row = + _mm256_add_pd(C_row, _mm256_mul_pd(B_row[2], _mm256_set1_pd(A[i][2]))); + C_row = + _mm256_add_pd(C_row, _mm256_mul_pd(B_row[3], _mm256_set1_pd(A[i][3]))); + _mm256_store_pd(&C[i][0], C_row); + } +# endif + +# if 0 + // Note: much slower than code above. + // https://github.com/romz-pl/matrix-matrix-multiply/blob/main/src/dgemm_avx256.cpp + for(uint32_t i = 0; i < 4; ++i) { + for(uint32_t j = 0; j < 4; ++j) { + __m256d c0 = _mm256_set_pd(0., 0., 0., 0.); + for(uint32_t k = 0; k < 4; k++) { + c0 = _mm256_add_pd(c0, /* c0 += A[i][k]*B[k][j] */ + _mm256_mul_pd(_mm256_load_pd(&A[i][k]), + _mm256_broadcast_sd(&B[k][j]))); + } + _mm256_store_pd(&C[i][j], c0); /* C[i][j] = c0 */ + } + } + +# endif + +# if 0 + // Note: broadcast_sd and set1_pd seem to have similar performance. + /* Favor keeping B in registers (broadcast version): */ + __m256d B_row[4]; + for(int i = 0; i < 4; ++i) { + B_row[i] = _mm256_load_pd(&B[i][0]); + } + for(int i = 0; i < 4; ++i) { + __m256d C_row = _mm256_mul_pd(B_row[0], _mm256_broadcast_sd(&A[i][0])); + C_row = + _mm256_add_pd(C_row, _mm256_mul_pd(B_row[1], _mm256_broadcast_sd(&A[i][1]))); + C_row = + _mm256_add_pd(C_row, _mm256_mul_pd(B_row[2], _mm256_broadcast_sd(&A[i][2]))); + C_row = + _mm256_add_pd(C_row, _mm256_mul_pd(B_row[3], _mm256_broadcast_sd(&A[i][3]))); + _mm256_store_pd(&C[i][0], C_row); + } +# endif + +# if 0 + // Much slower than caching B in registers. + /* Load B, use AVX2 FMA: */ + for(int i = 0; i < 4; ++i) { + __m256d C_row = _mm256_mul_pd( + _mm256_set1_pd(A[i][0]), + _mm256_load_pd(&B[0][0]) + ); + C_row = _mm256_fmadd_pd( + _mm256_set1_pd(A[i][1]), + _mm256_load_pd(&B[1][0]), + C_row); + C_row = _mm256_fmadd_pd( + _mm256_set1_pd(A[i][2]), + _mm256_load_pd(&B[2][0]), + C_row); + C_row = _mm256_fmadd_pd( + _mm256_set1_pd(A[i][3]), + _mm256_load_pd(&B[3][0]), + C_row); _mm256_store_pd(&C[i][0], C_row); } +# endif + +# if 0 + __m256d C_row0 = _mm256_mul_pd(_mm256_broadcast_sd(&A.d[0][0]), B.m[0]); + C_row0 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[0][1]), B.m[1], C_row0); + C_row0 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[0][2]), B.m[2], C_row0); + C_row0 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[0][3]), B.m[3], C_row0); + + __m256d C_row1 = _mm256_mul_pd(_mm256_broadcast_sd(&A.d[1][0]), B.m[0]); + C_row1 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[1][1]), B.m[1], C_row1); + C_row1 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[1][2]), B.m[2], C_row1); + C_row1 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[1][3]), B.m[3], C_row1); + + __m256d C_row2 = _mm256_mul_pd(_mm256_broadcast_sd(&A.d[2][0]), B.m[0]); + C_row2 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[2][1]), B.m[1], C_row2); + C_row2 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[2][2]), B.m[2], C_row2); + C_row2 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[2][3]), B.m[3], C_row2); + + __m256d C_row3 = _mm256_mul_pd(_mm256_broadcast_sd(&A.d[3][0]), B.m[0]); + C_row3 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[3][1]), B.m[1], C_row3); + C_row3 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[3][2]), B.m[2], C_row3); + C_row3 = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[3][3]), B.m[3], C_row3); + + C.m[0] = C_row0; + C.m[1] = C_row1; + C.m[2] = C_row2; + C.m[3] = C_row3; +# endif + +# if 0 + for(int i = 0; i < 4; ++i) { + __m256d C_row = _mm256_mul_pd(_mm256_broadcast_sd(&A.d[i][0]), B.m[0]); + C_row = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[i][1]), B.m[1], C_row); + C_row = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[i][2]), B.m[2], C_row); + C_row = _mm256_fmadd_pd(_mm256_broadcast_sd(&A.d[i][3]), B.m[3], C_row); + C.m[i] = C_row; + } +# endif } #elif defined(CML_SIMD_SSE4_2) -#if 0 +# if 0 /* Row major: */ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) { - -#if 0 +# if 0 C.zero(); __m128d A0[2], A1[2]; @@ -93,7 +335,7 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C[1] = _mm_add_pd(C[0], _mm_mul_pd(A1[0], B0[0])); C[1] = _mm_add_pd(C[1], _mm_mul_pd(A1[1], B1[0])); _mm_store_pd(&C[1][0], C[1]); -#endif +# endif @@ -111,7 +353,7 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); -#if 0 +# if 0 const auto A00 = A[ii + 0][kk + 0]; const auto A10 = A[ii + 1][kk + 0]; const auto B00 = B[kk + 0][jj + 0]; @@ -121,7 +363,7 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C01 += A00 * B01; C10 += A10 * B00; C11 += A10 * B01; -#endif +# endif A0j = _mm_set1_pd(A[ii + 0][kk + 1]); A1j = _mm_set1_pd(A[ii + 1][kk + 1]); @@ -129,7 +371,7 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); -#if 0 +# if 0 const auto A01 = A[ii + 0][kk + 1]; const auto A11 = A[ii + 1][kk + 1]; const auto B10 = B[kk + 1][jj + 0]; @@ -139,7 +381,7 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C01 += A01 * B11; C10 += A11 * B10; C11 += A11 * B11; -#endif +# endif } _mm_store_pd(&C[ii][0], C0j); @@ -148,10 +390,11 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) } } -#elif 1 +# elif 1 /* Basic blocked 2x2 using SSE2: */ -inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +inline void +mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) { __m128d C0j, C1j, A0j, A1j, Bk; for(int ii = 0; ii < 4; ii += 2) { @@ -162,11 +405,11 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) for(int kk = 0; kk < 4; kk += 2) { A0j = _mm_set1_pd(A[ii + 0][kk + 0]); A1j = _mm_set1_pd(A[ii + 1][kk + 0]); - Bk = _mm_loadu_pd(reinterpret_cast(&B[kk+0])); + Bk = _mm_loadu_pd(reinterpret_cast(&B[kk + 0])); C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); -#if 0 +# if 0 const auto A00 = A[ii + 0][kk + 0]; const auto A10 = A[ii + 1][kk + 0]; const auto B00 = B[kk + 0][jj + 0]; @@ -176,15 +419,15 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C01 += A00 * B01; C10 += A10 * B00; C11 += A10 * B01; -#endif +# endif A0j = _mm_set1_pd(A[ii + 0][kk + 1]); A1j = _mm_set1_pd(A[ii + 1][kk + 1]); - Bk = _mm_loadu_pd(reinterpret_cast(&B[kk+1])); + Bk = _mm_loadu_pd(reinterpret_cast(&B[kk + 1])); C0j = _mm_add_pd(C0j, _mm_mul_pd(A0j, Bk)); C1j = _mm_add_pd(C1j, _mm_mul_pd(A1j, Bk)); -#if 0 +# if 0 const auto A01 = A[ii + 0][kk + 1]; const auto A11 = A[ii + 1][kk + 1]; const auto B10 = B[kk + 1][jj + 0]; @@ -194,7 +437,7 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) C01 += A01 * B11; C10 += A11 * B10; C11 += A11 * B11; -#endif +# endif } _mm_store_pd(&C[ii][0], C0j); @@ -202,13 +445,13 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) } } } -#elif 0 +# elif 0 /* Basic 2x2 blocked multiply: */ -inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) +inline void +mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) { for(int ii = 0; ii < 4; ii += 2) { for(int jj = 0; jj < 4; jj += 2) { - auto C00 = 0., C01 = 0., C10 = 0., C11 = 0.; for(int kk = 0; kk < 4; kk += 2) { const auto A00 = A[ii + 0][kk + 0]; @@ -240,8 +483,8 @@ inline void mxm_4x4(matrix44d& C, const matrix44d& A, const matrix44d& B) } } } -#endif +# endif #else -#error "No SIMD intrinsics defined?" +# error "No SIMD intrinsics defined?" #endif diff --git a/tests/timing/expr_mxm.inl b/tests/timing/expr_mxm.inl index 58b31cb..16878ab 100644 --- a/tests/timing/expr_mxm.inl +++ b/tests/timing/expr_mxm.inl @@ -1,16 +1,19 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once +#ifndef CML_DISABLE_SIMD +# define CML_DISABLE_SIMD +#endif + #include using matrix44d = cml::matrix44d; -inline void mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) + +inline void +mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) { - res = A*B; -} \ No newline at end of file + res = A * B; +} diff --git a/tests/timing/expr_simd_mxm.inl b/tests/timing/expr_simd_mxm.inl new file mode 100644 index 0000000..1e18c09 --- /dev/null +++ b/tests/timing/expr_simd_mxm.inl @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + @@COPYRIGHT@@ + *-----------------------------------------------------------------------*/ + +#pragma once + +#ifdef CML_DISABLE_SIMD +#undef CML_DISABLE_SIMD +#endif + +#include + +using matrix44d = cml::matrix44d; + +inline void +mxm_4x4(matrix44d& res, const matrix44d& A, const matrix44d& B) +{ + res = A * B; +} diff --git a/tests/timing/make_rotation_matrix_pairs.h b/tests/timing/make_rotation_matrix_pairs.h index f94a9ef..a59e566 100644 --- a/tests/timing/make_rotation_matrix_pairs.h +++ b/tests/timing/make_rotation_matrix_pairs.h @@ -1,23 +1,20 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once #include #include +#include -namespace cml { -namespace testing { +namespace cml::testing { -template -std::vector> +template +std::vector> make_rotation_matrix_pairs(int N); -}} // cml::testing +} // namespace cml::testing #define _CML_TESTING_MAKE_ROTATION_MATRIX_PAIRS_TPP #include "make_rotation_matrix_pairs.tpp" diff --git a/tests/timing/make_rotation_matrix_pairs.tpp b/tests/timing/make_rotation_matrix_pairs.tpp index 4410c43..3a9c12c 100644 --- a/tests/timing/make_rotation_matrix_pairs.tpp +++ b/tests/timing/make_rotation_matrix_pairs.tpp @@ -1,31 +1,40 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #ifndef _CML_TESTING_MAKE_ROTATION_MATRIX_PAIRS_TPP -#error "make_rotation_matrix_pairs.tpp not included correctly" +# error "make_rotation_matrix_pairs.tpp not included correctly" #endif #include "uniform_random_rotation.h" -namespace cml { -namespace testing { +namespace cml::testing { -template -std::vector> +template +std::vector> make_rotation_matrix_pairs(int N) { - std::mt19937 rng(0xdeadbeef); - std::uniform_real_distribution<> d(0., std::nextafter(1.,2.)); - std::vector> rotations(N); - for(int i = 0; i < N; ++ i) { - cml::testing::uniform_random_rotation_4(rng, d, rotations[i].first); - cml::testing::uniform_random_rotation_4(rng, d, rotations[i].second); + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_real_distribution<> d(0., std::nextafter(1., 2.)); + std::vector> rotations(N); + for(int i = 0; i < N; ++i) { + auto& m1 = std::get<0>(rotations[i]); + auto& m2 = std::get<1>(rotations[i]); + cml::testing::uniform_random_rotation_4(rng, d, m1); + cml::testing::uniform_random_rotation_4(rng, d, m2); + + auto& m3 = std::get<2>(rotations[i]); + for(int row = 0; row < 4; ++row) { + for(int col = 0; col < 4; ++col) { + auto sum = 0.; + for(int k = 0; k < 4; ++k) + sum += cml::detail::get(m1, row, k) * cml::detail::get(m2, k, col); + m3(row, col) = sum; + } + } } return rotations; } -}} // cml::testing +} // namespace cml::testing diff --git a/tests/timing/matrix_mxm_timing.cpp b/tests/timing/matrix_mxm_timing.cpp index f324085..4b947b2 100644 --- a/tests/timing/matrix_mxm_timing.cpp +++ b/tests/timing/matrix_mxm_timing.cpp @@ -1,11 +1,9 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #include +#include /* Need the matrix type and CML type system specializations: */ #include CML_TIMING_MxM_INL @@ -13,14 +11,14 @@ #include "timing.h" #include "make_rotation_matrix_pairs.h" -int main(int argc, const char** argv) +int +main(int argc, const char** argv) { long N = LONG_MAX; if(argc == 2) { char* a_end = nullptr; N = std::strtol(argv[1], &a_end, 10); } else { - // N = 1; N = 10000000; } @@ -34,26 +32,46 @@ int main(int argc, const char** argv) const auto rotations = cml::testing::make_rotation_matrix_pairs(N); const auto prep_time_end = cml::testing::usec_time(); const auto prep_time = prep_time_end - prep_time_start; - std::printf("prep time (%d pairs): %.5lf s\n", N, static_cast(prep_time) / 1e6); + std::printf("prep time (%d pairs): %.5lf s\n", N, + static_cast(prep_time) / 1e6); /* Time N multiplications: */ - using data_t = struct { matrix44d M; }; + using data_t = struct + { + matrix44d M; + }; + std::vector out(N); const auto mxm_time_start = cml::testing::usec_time(); - for(int i = 0; i < N; ++ i) { - mxm_4x4(out[i].M, rotations[i].first, rotations[i].second); + for(int i = 0; i < N; ++i) { + mxm_4x4(out[i].M, std::get<0>(rotations[i]), std::get<1>(rotations[i])); } const auto mxm_time_end = cml::testing::usec_time(); const auto mxm_time = mxm_time_end - mxm_time_start; - std::printf("mxm time (%d pairs): %.5lf s\n", N, static_cast(mxm_time) / 1e6); - - if(N == 1) for(int i = 0; i < N; ++ i) { - std::clog << rotations[i].first << std::endl; - std::clog << rotations[i].second << std::endl; - std::clog << out[i].M << std::endl; - cml::matrix44d A(rotations[i].first), B(rotations[i].second); - std::clog << A * B << std::endl; + std::printf("mxm time (%d pairs): %.5lf s\n", N, + static_cast(mxm_time) / 1e6); + + const auto check_time_start = cml::testing::usec_time(); + std::uint_fast64_t errors = 0; + for(int i = 0; i < N; ++i) { + const auto& A = std::get<2>(rotations[i]); + const auto& B = out[i].M; + + for(int j = 0; j < 4; ++j) { + for(int k = 0; k < 4; ++k) { + const auto diff = + std::abs(cml::detail::get(A, j, k) - cml::detail::get(B, j, k)); + if(diff > 10. * cml::scalar_traits::epsilon()) { + ++errors; + } + } + } } + const auto check_time_end = cml::testing::usec_time(); + const auto check_time = check_time_end - check_time_start; + std::printf("check time (%d pairs): %.5lf s\n", N, + static_cast(check_time) / 1e6); + if(errors > 0) std::printf(" Warning: found %llu errors\n", errors); return 0; -} \ No newline at end of file +} diff --git a/tests/timing/timing.cpp b/tests/timing/timing.cpp index 7dfcad0..86d8470 100644 --- a/tests/timing/timing.cpp +++ b/tests/timing/timing.cpp @@ -1,51 +1,49 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #include "timing.h" #if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#define NOCRYPT -#define NOGDI -#define NOSERVICE -#define NOMCX -#define NOIME -#include -#include -#include +# define WIN32_LEAN_AND_MEAN +# define NOCRYPT +# define NOGDI +# define NOSERVICE +# define NOMCX +# define NOIME +# include +# include +# include #else -#include +# include #endif -namespace cml { -namespace testing { +namespace cml::testing { #if defined(_WIN32) -usec_t usec_time() +usec_t +usec_time() { - LARGE_INTEGER liC, liF; + LARGE_INTEGER liC, liF; - /* Get counts per second: */ - QueryPerformanceFrequency(&liF); + /* Get counts per second: */ + QueryPerformanceFrequency(&liF); - /* Get current count: */ - QueryPerformanceCounter(&liC); + /* Get current count: */ + QueryPerformanceCounter(&liC); - LONGLONG freq = liF.QuadPart; - LONGLONG val = liC.QuadPart; + LONGLONG freq = liF.QuadPart; + LONGLONG val = liC.QuadPart; - /* Return total number of microseconds: */ - return (val*1000000ULL)/freq; + /* Return total number of microseconds: */ + return (val * 1000000ULL) / freq; } #else -usec_t usec_time() +usec_t +usec_time() { timeval tv; - gettimeofday(&tv,0); - return tv.tv_sec*1000000ULL + tv.tv_usec; + gettimeofday(&tv, 0); + return tv.tv_sec * 1000000ULL + tv.tv_usec; } #endif -}} // namespace cml::testing \ No newline at end of file +} // namespace cml::testing diff --git a/tests/timing/timing.h b/tests/timing/timing.h index 639fb45..b67ae89 100644 --- a/tests/timing/timing.h +++ b/tests/timing/timing.h @@ -1,14 +1,10 @@ -/* -*- C++ -*- ------------------------------------------------------------ - @@COPYRIGHT@@ +/*------------------------------------------------------------------------- + @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once -namespace cml { -namespace testing { - using usec_t = unsigned long long; - usec_t usec_time(); -}} // cml::testing \ No newline at end of file +namespace cml::testing { +using usec_t = unsigned long long; +usec_t usec_time(); +} // namespace cml::testing diff --git a/tests/timing/uniform_random_rotation.h b/tests/timing/uniform_random_rotation.h index 87af7a1..0fb7df9 100644 --- a/tests/timing/uniform_random_rotation.h +++ b/tests/timing/uniform_random_rotation.h @@ -1,22 +1,19 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #pragma once #include +#include -namespace cml { -namespace testing { +namespace cml::testing { -template -void uniform_random_rotation_4( - Gen& g, std::uniform_real_distribution<>& d, Matrix44T& M); +template +void uniform_random_rotation_4(Gen& g, std::uniform_real_distribution<>& d, + Matrix44dT& M); -}} // cml::testing +} // namespace cml::testing #define _CML_TESTING_UNIFORM_RANDOM_ROTATION_TPP #include "uniform_random_rotation.tpp" diff --git a/tests/timing/uniform_random_rotation.tpp b/tests/timing/uniform_random_rotation.tpp index 85a1fdb..80f794f 100644 --- a/tests/timing/uniform_random_rotation.tpp +++ b/tests/timing/uniform_random_rotation.tpp @@ -1,35 +1,31 @@ -/* -*- C++ -*- ------------------------------------------------------------ +/*------------------------------------------------------------------------- @@COPYRIGHT@@ *-----------------------------------------------------------------------*/ -/** @file - * @brief - */ #ifndef _CML_TESTING_UNIFORM_RANDOM_ROTATION_TPP -#error "uniform_random_rotation.tpp not included correctly" +# error "uniform_random_rotation.tpp not included correctly" #endif -#include #include #include #include -namespace cml { -namespace testing { +namespace cml::testing { /* NOTE: d must return values in [0.,1.) */ // TODO make this an official CML function? -template -void uniform_random_rotation_4( - Gen& g, std::uniform_real_distribution<>& d, Matrix44T& M) +template +void +uniform_random_rotation_4(Gen& g, std::uniform_real_distribution<>& d, + Matrix44dT& M) { - cml_require(0. <= d.a() && d.a() <= 1., - std::invalid_argument, "d not in [0,1.]"); + cml_require(0. <= d.a() && d.a() <= 1., std::invalid_argument, + "d not in [0,1.]"); /* First, create a random unit quaternion (see K. Shoemake, Graphics Gems * III): */ - using scalar_type = value_type_trait_of_t; + using scalar_type = value_type_trait_of_t; const auto two_pi = cml::constants::two_pi(); const auto u = d(g), u1 = d(g), u2 = d(g); const auto s1 = std::sqrt(1. - u), s2 = std::sqrt(u); @@ -40,8 +36,9 @@ void uniform_random_rotation_4( const auto z = std::sin(t2) * s2; const auto q = cml::quaterniond(x, y, z, w).normalize(); - cml::external44d M_out(std::addressof(cml::detail::get(M, 0, 0))); + cml::external44d M_out( + std::addressof(cml::detail::get(M, 0, 0))); cml::matrix_rotation_quaternion(M_out, q); } -}} // cml::testing +} // namespace cml::testing