From 1af16a8bdf3ef3b356054a9038afd2f0b94b0627 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 17 Dec 2015 14:10:23 -0500 Subject: [PATCH 01/45] Fixing issue with beta == 0 in AutoGemm kernels --- src/library/blas/AutoGemm/KernelOpenCL.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/library/blas/AutoGemm/KernelOpenCL.py b/src/library/blas/AutoGemm/KernelOpenCL.py index ccf266f4..d8b38716 100644 --- a/src/library/blas/AutoGemm/KernelOpenCL.py +++ b/src/library/blas/AutoGemm/KernelOpenCL.py @@ -162,12 +162,6 @@ def makeOpenCLKernelString(kernel): " REG.s0 = mad( -ALPHA.s1, REG.s1, REG.s0 ); \\\\" + endLine + " REG.s1 *= ALPHA.s0; \\\\" + endLine + " REG.s1 = mad( ALPHA.s1, type_mad_tmp, REG.s1 ); \\\\" + endLine + - " /* (2) */ \\\\" + endLine + - " REG.s0 = mad( BETA.s0, DST.s0, REG.s0 ); \\\\" + endLine + - " REG.s0 = mad( -BETA.s1, DST.s1, REG.s0 ); \\\\" + endLine + - " REG.s1 = mad( BETA.s1, DST.s0, REG.s1 ); \\\\" + endLine + - " REG.s1 = mad( BETA.s0, DST.s1, REG.s1 ); \\\\" + endLine + - " /* (3) */ \\\\" + endLine + " DST = REG;" + endLine ) #################################### From 969b5c6f2f8b71a44e9f064f3ac72de151caccd1 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 17 Dec 2015 16:59:25 -0500 Subject: [PATCH 02/45] Fixing integer divides to make clBLAS work when building with python3 --- src/library/blas/AutoGemm/KernelOpenCL.py | 4 ++-- src/library/blas/AutoGemm/KernelParameters.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/library/blas/AutoGemm/KernelOpenCL.py b/src/library/blas/AutoGemm/KernelOpenCL.py index d8b38716..d7835d56 100644 --- a/src/library/blas/AutoGemm/KernelOpenCL.py +++ b/src/library/blas/AutoGemm/KernelOpenCL.py @@ -351,11 +351,11 @@ def makeOpenCLKernelString(kernel): kStr += endLine kStr += " /* load global -> local */" + endLine numALoads = (kernel.workGroupNumRows*kernel.microTileNumRows*kernel.unroll) \ - / (kernel.workGroupNumRows*kernel.workGroupNumCols) + // 
(kernel.workGroupNumRows*kernel.workGroupNumCols) # // -- integer divide numALoadsR = (kernel.workGroupNumRows*kernel.microTileNumRows*kernel.unroll) \ % (kernel.workGroupNumRows*kernel.workGroupNumCols) numBLoads = (kernel.workGroupNumCols*kernel.microTileNumCols*kernel.unroll) \ - / (kernel.workGroupNumRows*kernel.workGroupNumCols) + // (kernel.workGroupNumRows*kernel.workGroupNumCols) # // - integer divide numBLoadsR = (kernel.workGroupNumCols*kernel.microTileNumCols*kernel.unroll) \ % (kernel.workGroupNumRows*kernel.workGroupNumCols) diff --git a/src/library/blas/AutoGemm/KernelParameters.py b/src/library/blas/AutoGemm/KernelParameters.py index b797d96c..565a62d3 100644 --- a/src/library/blas/AutoGemm/KernelParameters.py +++ b/src/library/blas/AutoGemm/KernelParameters.py @@ -89,11 +89,11 @@ def isValid(self): return True """ numALoads = (self.workGroupNumRows*self.microTileNumRows*self.unroll) \ - / (self.workGroupNumRows*self.workGroupNumCols) + // (self.workGroupNumRows*self.workGroupNumCols) numALoadsR = (self.workGroupNumRows*self.microTileNumRows*self.unroll) \ % (self.workGroupNumRows*self.workGroupNumCols) numBLoads = (self.workGroupNumCols*self.microTileNumCols*self.unroll) \ - / (self.workGroupNumRows*self.workGroupNumCols) + // (self.workGroupNumRows*self.workGroupNumCols) numBLoadsR = (self.workGroupNumCols*self.microTileNumCols*self.unroll) \ % (self.workGroupNumRows*self.workGroupNumCols) if (numALoads>0 and numALoadsR>0): From c41cc5dbd7a70dae98f4e055b9fa2ff5d4bde6d3 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Thu, 17 Dec 2015 18:58:55 -0500 Subject: [PATCH 03/45] Trtri kernel build options were hard coded to 2.0 --- src/library/blas/trtri/TrtriKernelSourceIncludes.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.h b/src/library/blas/trtri/TrtriKernelSourceIncludes.h index c7cc6bbb..b15ab78c 100644 --- a/src/library/blas/trtri/TrtriKernelSourceIncludes.h +++ 
b/src/library/blas/trtri/TrtriKernelSourceIncludes.h @@ -6,11 +6,11 @@ //**** compiler flags //**** online compilation flags -const char * const TrtriBuildOptions = "-cl-std=CL2.0"; -const char * const TrtribinBuildOptions = "-cl-std=CL2.0"; +const char * const TrtriBuildOptions = "-cl-std=CL" OPENCL_VERSION; +const char * const TrtribinBuildOptions = "-cl-std=CL" OPENCL_VERSION; /*mod 192 dtrsm*/ -extern const char * const diag_dtrtri_upper_192_12_src; +extern const char * const diag_dtrtri_upper_192_12_src; extern unsigned char *diag_dtrtri_upper_192_12_bin; extern size_t diag_dtrtri_upper_192_12_binSize; From de196fe9345fcfb6ea051b5198402b45fb04f0af Mon Sep 17 00:00:00 2001 From: Timmy Date: Tue, 5 Jan 2016 11:10:41 -0600 Subject: [PATCH 04/45] bump develop version number to 2.11.0 --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9ec932b1..8d98ecf4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -108,7 +108,7 @@ if( NOT DEFINED clBLAS_VERSION_MAJOR ) endif( ) if( NOT DEFINED clBLAS_VERSION_MINOR ) - set( clBLAS_VERSION_MINOR 9 ) + set( clBLAS_VERSION_MINOR 11 ) endif( ) if( NOT DEFINED clBLAS_VERSION_PATCH ) From a649bde30911efd5b1754ed59eca44ac982cd59a Mon Sep 17 00:00:00 2001 From: timmy Date: Wed, 6 Jan 2016 22:19:17 -0600 Subject: [PATCH 05/45] avoid removing userGemmClKernels.cc with make clean --- src/library/CMakeLists.txt | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt index 2a6ba237..1925b6c2 100644 --- a/src/library/CMakeLists.txt +++ b/src/library/CMakeLists.txt @@ -44,7 +44,6 @@ set(AUTOGEMM_HEADERS ) set(AUTOGEMM_SRC - ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.cc ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmClKernels.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBuildOptionsBinary.cpp 
${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBinaries.cpp @@ -54,9 +53,9 @@ set(AUTOGEMM_SRC ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSources.cpp ) -#set(USERGEMM_SRC -# ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.cpp -#) +set(USERGEMM_SRC + ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.cc +) set(USERGEMM_HEADERS ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.h @@ -854,8 +853,8 @@ set(CLBLAS_ALL_SOURCES ${AUTOGEMM_HEADERS} ${AUTOGEMM_SCRIPTS} ${AUTOGEMM_PRECOMPILED_KERNELS_CONDITIONAL} - #${USERGEMM_SRC} - #${USERGEMM_HEADERS} + ${USERGEMM_SRC} + ${USERGEMM_HEADERS} ) add_definitions(-DOPENCL_VERSION="${OPENCL_VERSION}") add_library(clBLAS ${CLBLAS_ALL_SOURCES}) From 6041a3a43c1f8b599356c7303f16a0d41835f657 Mon Sep 17 00:00:00 2001 From: Timmy Date: Thu, 7 Jan 2016 14:47:17 -0600 Subject: [PATCH 06/45] fix some exception handlers. 
now test-functional all pass --- src/CMakeLists.txt | 2 +- src/library/blas/xgemm.cc | 132 +++++++++++++++++++++++++--- src/tests/functional/func-error.cpp | 4 +- 3 files changed, 124 insertions(+), 14 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8d98ecf4..1fa849d1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,7 +18,7 @@ cmake_minimum_required(VERSION 2.8) #User toggle-able options that can be changed on the command line with -D option( BUILD_RUNTIME "Build the BLAS runtime library" ON ) -option( BUILD_TEST "Build the library testing suite (dependency on google test, Boost, and ACML)" ON ) +option( BUILD_TEST "Build the library testing suite (dependency on google test, Boost, and ACML/NETLIB BLAS)" ON ) option( BUILD_PERFORMANCE "Copy the performance scripts that can measure and graph performance" OFF ) option( BUILD_SAMPLE "Build the sample programs" OFF ) option( BUILD_CLIENT "Build a command line clBLAS client program with a variety of configurable parameters (dependency on Boost)" OFF ) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index bfb133a0..71165dd4 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -61,6 +61,9 @@ static void force_gemm_column_major( printf("OpenCL error %i on line %u of %s\n", RET, __LINE__, __FILE__); \ assert(false); \ } +#define returnIfErr(err) \ + if (err != CL_SUCCESS)\ + return static_cast(err); const static unsigned int numGemmKernelArgs = 14; void *gemmKernelArgs[numGemmKernelArgs]; @@ -258,7 +261,7 @@ void makeGemmKernel( /****************************************************************************** * Enqueue Gemm Kernel *****************************************************************************/ - void enqueueGemmKernel( + cl_int enqueueGemmKernel( cl_command_queue clQueue, cl_kernel clKernel, void **kernelArgs, @@ -271,14 +274,20 @@ void makeGemmKernel( cl_event *clEvent) { for (unsigned int i = 0; i < numKernelArgs; i++) { - 
CL_CHECK( clSetKernelArg( clKernel, i, kernelArgSizes[i], kernelArgs[i]) ) + cl_int err = clSetKernelArg(clKernel, i, kernelArgSizes[i], kernelArgs[i]); + if (err != CL_SUCCESS) + return err; } /*printf("global={%llu, %llu} local={%llu, %llu}\n", globalWorkSize[0], globalWorkSize[1], localWorkSize[0], localWorkSize[1] );*/ - CL_CHECK( clEnqueueNDRangeKernel( clQueue, clKernel, - 2, NULL, globalWorkSize, localWorkSize, - numEventsInWaitList, eventWaitList, clEvent ) ) + cl_uint err = clEnqueueNDRangeKernel(clQueue, clKernel, + 2, NULL, globalWorkSize, localWorkSize, + numEventsInWaitList, eventWaitList, clEvent); + if (err != CL_SUCCESS) + return err; + + return CL_SUCCESS; } @@ -325,6 +334,8 @@ clblasGemm( const cl_event *eventWaitList, cl_event *events) { + + // cast types to opencl types cl_mem A = iA; cl_mem B = iB; @@ -389,10 +400,13 @@ clblasGemm( cl_int err; cl_device_id clDevice; err = clGetCommandQueueInfo( commandQueues[0], CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL); - CL_CHECK(err) + //CL_CHECK(err) + returnIfErr(err); + cl_uint clDeviceNumCUs; err = clGetDeviceInfo( clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(clDeviceNumCUs), &clDeviceNumCUs, NULL); - CL_CHECK(err) + //CL_CHECK(err) + returnIfErr(err); unsigned int deviceIdealNumThreads = (8 /*waves per CU*/)*(64 /*threads per wave*/)*clDeviceNumCUs; float optimalNumElementsPerThread = ((float)M*N) / deviceIdealNumThreads; //optimalNumElementsPerThread = 32; @@ -562,11 +576,12 @@ clblasGemm( if (needTileKernel) { //printf("enqueueing tile kernel\n"); size_t globalWorkSize[2] = {(M/macroTileNumRows)*workGroupNumRows, (N/macroTileNumCols)*workGroupNumCols }; - enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *tileClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *tileClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, 
&events[numKernelsEnqueued%numCommandQueues] ); + returnIfErr(err); numKernelsEnqueued++; } @@ -576,11 +591,12 @@ clblasGemm( if (needRowKernel) { //printf("enqueueing row kernel\n"); size_t globalWorkSize[2] = {1*workGroupNumRows, (N/macroTileNumCols)*workGroupNumCols }; - enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *rowClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *rowClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); + returnIfErr(err); numKernelsEnqueued++; } @@ -590,11 +606,12 @@ clblasGemm( if (needColKernel) { //printf("enqueueing col kernel\n"); size_t globalWorkSize[2] = { (M/macroTileNumRows)*workGroupNumRows, 1*workGroupNumCols }; - enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *colClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *colClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); + returnIfErr(err); numKernelsEnqueued++; } @@ -604,11 +621,12 @@ clblasGemm( if (needCornerKernel) { //printf("enqueueing corner kernel\n"); size_t globalWorkSize[2] = { 1*workGroupNumRows, 1*workGroupNumCols }; - enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *cornerClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *cornerClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); + returnIfErr(err); numKernelsEnqueued++; } @@ -637,6 +655,29 @@ clblasSgemm( const cl_event *eventWaitList, cl_event *events) { + // check if memory objects are valid + clblasStatus clblasErr = clblasSuccess; + clblasErr = 
checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + if (K != 0) + { + //check matrix A + clblasErr = checkMatrixSizes(TYPE_FLOAT, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + //check matrix B + clblasErr = checkMatrixSizes(TYPE_FLOAT, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + } + //check matrix C + clblasErr = checkMatrixSizes(TYPE_FLOAT, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + return clblasGemm( order, transA, @@ -674,6 +715,29 @@ clblasDgemm( clblasOrder order, const cl_event *eventWaitList, cl_event *events) { + // check if memory objects are valid + clblasStatus clblasErr = clblasSuccess; + clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + if (K != 0) + { + //check matrix A + clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + //check matrix B + clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + } + //check matrix C + clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + return clblasGemm( order, transA, @@ -712,6 +776,29 @@ clblasCgemm( const cl_event *eventWaitList, cl_event *events) { + // check if memory objects are valid + clblasStatus clblasErr = clblasSuccess; + clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + if (K != 0) + { + //check matrix A + clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, 
transA, M, K, A, offA, lda, A_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + //check matrix B + clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + } + //check matrix C + clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + return clblasGemm( order, transA, @@ -750,6 +837,29 @@ clblasZgemm( const cl_event *eventWaitList, cl_event *events) { + // check if memory objects are valid + clblasStatus clblasErr = clblasSuccess; + clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + if (K != 0) + { + //check matrix A + clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + + //check matrix B + clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + } + //check matrix C + clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET); + if (clblasErr != clblasSuccess) + return clblasErr; + return clblasGemm( order, transA, diff --git a/src/tests/functional/func-error.cpp b/src/tests/functional/func-error.cpp index 064e1826..3a6bf51b 100644 --- a/src/tests/functional/func-error.cpp +++ b/src/tests/functional/func-error.cpp @@ -272,7 +272,7 @@ TEST(ERROR, InvalidMemObjectsymm) { } TEST(ERROR, InvalidValuesymm) { ErrorClass > ec; - ec.error(clblasInsufficientMemMatB); + ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicesymm) { @@ -512,7 +512,7 @@ TEST(ERROR, InvalidMemObjecthemm) { TEST(ERROR, InvalidValuehemm) { ErrorClass > ec; - ec.error(clblasInsufficientMemMatB); + 
ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicehemm) { From 9d4c312a012f6da830c57cd7db8f3d335fd81a9c Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Sun, 10 Jan 2016 12:30:29 -0500 Subject: [PATCH 07/45] Protect pragma in preprocessor macro by using _Pragma. clang 3.7 will not allow compilation of the code otherwise (found on FreeBSD-CURRENT). The solution employed here is equivalent to the one used in #189. --- .../dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp index 6be60e1e..04c07e8a 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp @@ -18,7 +18,7 @@ const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NN_B0_MX048_NX048_KX08_src = STRINGIFY( -#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +_Pragma("OPENCL EXTENSION cl_khr_fp64 : enable") \n #define M6x6 \ rA[0] = lA[offA + 0];\ diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp index 5b488814..05417daa 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp @@ 
-18,7 +18,7 @@ const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NN_B1_MX048_NX048_KX08_src = STRINGIFY( -#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +_Pragma("OPENCL EXTENSION cl_khr_fp64 : enable") \n #define M6x6 \ rA[0] = lA[offA + 0]; \ diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp index 41300624..ffe879af 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp @@ -18,7 +18,7 @@ const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NT_B0_MX048_NX048_KX08_src = STRINGIFY( -#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +_Pragma("OPENCL EXTENSION cl_khr_fp64 : enable") \n \n \ntypedef union _GPtr { \n __global float *f; diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp index 1ccd29d5..5af48fcb 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp @@ -18,7 +18,7 @@ const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NT_B1_MX048_NX048_KX08_src = STRINGIFY( -#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +_Pragma("OPENCL EXTENSION cl_khr_fp64 : enable") \n \n \ntypedef union _GPtr { \n __global float *f; diff --git 
a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp index 91c7a9a4..1bed066f 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp @@ -18,7 +18,7 @@ const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_TN_B0_MX048_NX048_KX08_src = STRINGIFY( -#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +_Pragma("OPENCL EXTENSION cl_khr_fp64 : enable") \n __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_Col_TN_B0_MX048_NX048_KX08_src ( diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp index 79b0d0ef..927952b2 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp @@ -18,7 +18,7 @@ const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_TN_B1_MX048_NX048_KX08_src = STRINGIFY( -#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +_Pragma("OPENCL EXTENSION cl_khr_fp64 : enable") \n __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_Col_TN_B1_MX048_NX048_KX08_src ( From 5ac6253c66c237acc356e6b8b2a6ef68b8e7323b Mon Sep 17 00:00:00 2001 From: tingxingdong Date: Tue, 19 Jan 2016 11:30:28 -0600 Subject: [PATCH 08/45] fix a hard coding bug change 4 --> numQueues --- src/client/clfunc_xsyrk.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp index 7b7d1d86..c6f35964 100644 --- a/src/client/clfunc_xsyrk.hpp +++ b/src/client/clfunc_xsyrk.hpp @@ -458,7 +458,7 @@ call_func() clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, 4, queues_, 0, NULL, &event_); + buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); From 7385f6807aabdf997dc8a3dcd65042940a6d39d3 Mon Sep 17 00:00:00 2001 From: tingxingdong Date: Tue, 19 Jan 2016 11:40:47 -0600 Subject: [PATCH 09/45] put the numQueues to be 1 In all the testers (.hpp files), memory is allocated only in the first queue (queue[0]). However, clBLAS kernels (except GEMM) are called with numQueues queues. This bug makes client.exe fail for all kernels except GEMM. A quick patch is to set numQueues=1 here. --- src/client/clfunc_common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp index 3a66d616..fc2057ba 100644 --- a/src/client/clfunc_common.hpp +++ b/src/client/clfunc_common.hpp @@ -342,7 +342,7 @@ class clblasFunc cl_device_id device_; cl_context_properties props_[3]; cl_context ctx_; - static const unsigned int numQueues = 4; + static const unsigned int numQueues = 1; cl_command_queue queues_[numQueues]; clblasOrder order_; cl_event event_; From 627c6545591d16da159bc6f479070441916f827c Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 15 Jan 2016 20:28:13 -0500 Subject: [PATCH 10/45] Fixing issue with beta == 0 in UserGemm kernels Related to 1af16a8bdf3ef3b356054a9038afd2f0b94b0627 --- .../sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp | 72 +++++++++---------- .../sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp | 72 +++++++++---------- .../sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp | 72 +++++++++---------- 3 files changed, 108 
insertions(+), 108 deletions(-) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp index 5151bde8..c1f92569 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp @@ -161,47 +161,47 @@ __kernel void sgemm_Col_NN_B0_MX096_NX096_KX16 ( C+= gidy*96*ldc; C+= idy*ldc; - C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; + C[0 *ldc] = alpha*rC[0][0]; + C[16*ldc] = alpha*rC[0][1]; + C[32*ldc] = alpha*rC[0][2]; + C[48*ldc] = alpha*rC[0][3]; + C[64*ldc] = alpha*rC[0][4]; + C[80*ldc] = alpha*rC[0][5]; C+=16; - C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; + C[0 *ldc] = alpha*rC[1][0]; + C[16*ldc] = alpha*rC[1][1]; + C[32*ldc] = alpha*rC[1][2]; + C[48*ldc] = alpha*rC[1][3]; + C[64*ldc] = alpha*rC[1][4]; + C[80*ldc] = alpha*rC[1][5]; C+=16; - C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; + C[0 *ldc] = alpha*rC[2][0]; + C[16*ldc] = alpha*rC[2][1]; + C[32*ldc] = alpha*rC[2][2]; + C[48*ldc] = alpha*rC[2][3]; + C[64*ldc] = alpha*rC[2][4]; + C[80*ldc] = alpha*rC[2][5]; C+=16; - C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; - 
C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; + C[0 *ldc] = alpha*rC[3][0]; + C[16*ldc] = alpha*rC[3][1]; + C[32*ldc] = alpha*rC[3][2]; + C[48*ldc] = alpha*rC[3][3]; + C[64*ldc] = alpha*rC[3][4]; + C[80*ldc] = alpha*rC[3][5]; C+=16; - C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; + C[0 *ldc] = alpha*rC[4][0]; + C[16*ldc] = alpha*rC[4][1]; + C[32*ldc] = alpha*rC[4][2]; + C[48*ldc] = alpha*rC[4][3]; + C[64*ldc] = alpha*rC[4][4]; + C[80*ldc] = alpha*rC[4][5]; C+=16; - C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; + C[0 *ldc] = alpha*rC[5][0]; + C[16*ldc] = alpha*rC[5][1]; + C[32*ldc] = alpha*rC[5][2]; + C[48*ldc] = alpha*rC[5][3]; + C[64*ldc] = alpha*rC[5][4]; + C[80*ldc] = alpha*rC[5][5]; } ); diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp index d22eca61..a8d0fec1 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp @@ -163,47 +163,47 @@ __kernel void sgemm_Col_NT_B0_MX096_NX096_KX16 ( C+= gidy*96*ldc; C+= idy*ldc; - C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[0][2] + 
beta*C[32*ldc]; - C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[0][0]; + C[16*ldc] = alpha*rC[0][1]; + C[32*ldc] = alpha*rC[0][2]; + C[48*ldc] = alpha*rC[0][3]; + C[64*ldc] = alpha*rC[0][4]; + C[80*ldc] = alpha*rC[0][5]; C+=16; - C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[1][0]; + C[16*ldc] = alpha*rC[1][1]; + C[32*ldc] = alpha*rC[1][2]; + C[48*ldc] = alpha*rC[1][3]; + C[64*ldc] = alpha*rC[1][4]; + C[80*ldc] = alpha*rC[1][5]; C+=16; - C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[2][0]; + C[16*ldc] = alpha*rC[2][1]; + C[32*ldc] = alpha*rC[2][2]; + C[48*ldc] = alpha*rC[2][3]; + C[64*ldc] = alpha*rC[2][4]; + C[80*ldc] = alpha*rC[2][5]; C+=16; - C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[3][0]; + C[16*ldc] = alpha*rC[3][1]; + C[32*ldc] = alpha*rC[3][2]; + C[48*ldc] = alpha*rC[3][3]; + C[64*ldc] = alpha*rC[3][4]; + C[80*ldc] = alpha*rC[3][5]; C+=16; - C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; - C[80*ldc] = 
alpha*rC[4][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[4][0]; + C[16*ldc] = alpha*rC[4][1]; + C[32*ldc] = alpha*rC[4][2]; + C[48*ldc] = alpha*rC[4][3]; + C[64*ldc] = alpha*rC[4][4]; + C[80*ldc] = alpha*rC[4][5]; C+=16; - C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[5][0]; + C[16*ldc] = alpha*rC[5][1]; + C[32*ldc] = alpha*rC[5][2]; + C[48*ldc] = alpha*rC[5][3]; + C[64*ldc] = alpha*rC[5][4]; + C[80*ldc] = alpha*rC[5][5]; } ); diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp index 2668bfa1..48323fc3 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp @@ -162,47 +162,47 @@ __kernel void sgemm_Col_TN_B0_MX096_NX096_KX16 ( C+= gidy*96*ldc; C+= idy*ldc; - C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[0][0]; + C[16*ldc] = alpha*rC[0][1]; + C[32*ldc] = alpha*rC[0][2]; + C[48*ldc] = alpha*rC[0][3]; + C[64*ldc] = alpha*rC[0][4]; + C[80*ldc] = alpha*rC[0][5]; C+=16; - C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[1][0]; + C[16*ldc] = 
alpha*rC[1][1]; + C[32*ldc] = alpha*rC[1][2]; + C[48*ldc] = alpha*rC[1][3]; + C[64*ldc] = alpha*rC[1][4]; + C[80*ldc] = alpha*rC[1][5]; C+=16; - C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[2][0]; + C[16*ldc] = alpha*rC[2][1]; + C[32*ldc] = alpha*rC[2][2]; + C[48*ldc] = alpha*rC[2][3]; + C[64*ldc] = alpha*rC[2][4]; + C[80*ldc] = alpha*rC[2][5]; C+=16; - C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[3][0]; + C[16*ldc] = alpha*rC[3][1]; + C[32*ldc] = alpha*rC[3][2]; + C[48*ldc] = alpha*rC[3][3]; + C[64*ldc] = alpha*rC[3][4]; + C[80*ldc] = alpha*rC[3][5]; C+=16; - C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[4][0]; + C[16*ldc] = alpha*rC[4][1]; + C[32*ldc] = alpha*rC[4][2]; + C[48*ldc] = alpha*rC[4][3]; + C[64*ldc] = alpha*rC[4][4]; + C[80*ldc] = alpha*rC[4][5]; C+=16; - C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; - C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; - C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; - C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; - C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; - C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; + C[0*ldc] = alpha*rC[5][0]; + C[16*ldc] = alpha*rC[5][1]; + C[32*ldc] = alpha*rC[5][2]; + C[48*ldc] = alpha*rC[5][3]; + C[64*ldc] = alpha*rC[5][4]; + C[80*ldc] = alpha*rC[5][5]; 
} ); From 9c66a77e1d51db8dd5a4f2039f9d64236d50edf6 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 18 Jan 2016 13:53:16 -0500 Subject: [PATCH 11/45] Fixing issues for when Beta == 0 in sgemm special cases --- ..._Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 70 +++++++++++-------- ..._Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp | 68 ++++++++++-------- ..._Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp | 33 ++++----- ...emm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp | 20 +++++- ...emm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp | 18 +++-- ..._Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 58 +++++++++------ 6 files changed, 162 insertions(+), 105 deletions(-) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp index 831b5dc0..cc90ff90 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -57,46 +57,46 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH ( float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; - - + + A += offsetA; B += offsetB; C+=offsetC; - + __local float lA[528];//16*32+16 __local float lB[528]; - + uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); - + int CurrentOffSetA = gidx*32+ idx; int CurrentOffSetB = gidy*32+ idy; A += gidx*32+ idx + idy*lda; B += gidy*32*ldb+ idx + idy*ldb; - - + + uint block_k = K >> 4; - do + do { __local float* plA = lA + idy*33+idx; __local float* plB = lB + idx*33+idy; barrier(CLK_LOCAL_MEM_FENCE); - + plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; - + plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; - + barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; - + 
M2x2 M2x2 M2x2 @@ -123,26 +123,36 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH ( int offset_y = gidy*32+ idy; if(offset_x>=M || offset_y>=N ) return; - + C+=offset_x+offset_y*ldc; - - + + int i = 0; - do - { - C[0 ] = mad(alpha, rC[i][0], beta*C[0]); - if(offset_y+16=M ) - return; - - - } - while (++i < 2); - + if (beta != 0) { + do + { + C[0 ] = mad(alpha, rC[i][0], beta*C[0]); + if(offset_y+16=M ) + return; + } + while (++i < 2); + } else { + do + { + C[0 ] = alpha * rC[i][0]; + if(offset_y+16=M ) + return; + } + while (++i < 2); + } } ); diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp index f50b5800..f26ddece 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -57,41 +57,41 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH ( float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; - - + + A += offsetA; B += offsetB; C+=offsetC; - + __local float lA[528];//16*32+16 __local float lB[528]; - + uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); - + int CurrentOffSetA = gidx*32+ idx; int CurrentOffSetB = gidy*32+ idx; - + A += gidx*32+ idx + idy*lda; B += gidy*32+ idx + idy*ldb; - - + + uint block_k = K >> 4; - do + do { __local float* plA = lA + idy*33+idx; __local float* plB = lB + idy*33+idx; barrier(CLK_LOCAL_MEM_FENCE); - + plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; - + plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; - + barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; @@ -126,23 +126,35 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH ( return; C+=offset_x+offset_y*ldc; 
- - int i = 0; - do - { - C[0 ] = mad(alpha, rC[i][0], beta*C[0]); - if(offset_y+16=M ) - return; - - - } - while (++i < 2); + int i = 0; + if (beta !=0 ) { + do + { + C[0 ] = mad(alpha, rC[i][0], beta*C[0]); + if(offset_y+16=M ) + return; + } + while (++i < 2); + } else { + do + { + C[0 ] = alpha * rC[i][0]; + if(offset_y+16=M ) + return; + } + while (++i < 2); + } } ); #endif diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp index be5b219e..a01958f1 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp @@ -133,25 +133,26 @@ __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE ( int offset_x = gidx * 64 + idx; int offset_y = gidy * 64 + idy; - //if(offset_x>=M || offset_y>=N ) - // return; - C += offset_x + offset_y*ldc; int i = 0; - do - { - C[0] = mad(alpha, rC[i][0], beta*C[0]); - C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); - - - C += 16; - offset_x += 16; - //if(offset_x>=M ) - // return; - - - } while (++i < 2); + if (beta != 0) { + do + { + C[0] = mad(alpha, rC[i][0], beta*C[0]); + C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); + C += 16; + offset_x += 16; + } while (++i < 2); + } else { + do + { + C[0] = alpha * rC[i][0]; + C[16 * ldc] = alpha * rC[i][1]; + C += 16; + offset_x += 16; + } while (++i < 2); + } } ); diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp index 5c414069..ae477cbe 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp +++ 
b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp @@ -145,8 +145,9 @@ __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW ( C += offset_x + offset_y*ldc; int i = 0; - do - { + if (beta != 0) { + do + { C[0] = mad(alpha, rC[i][0], beta*C[0]); C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); C[32 * ldc] = mad(alpha, rC[i][2], beta*C[32 * ldc]); @@ -155,7 +156,20 @@ __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW ( offset_x += 16; //if(offset_x>=M ) // return; - } while (++i < 2); + } while (++i < 2); + } else { + do + { + C[0] = alpha * rC[i][0]; + C[16 * ldc] = alpha * rC[i][1]; + C[32 * ldc] = alpha * rC[i][2]; + C[48 * ldc] = alpha * rC[i][3]; + C += 16; + offset_x += 16; + //if(offset_x>=M ) + // return; + } while (++i < 2); + } } ); #endif diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp index 2c9e9ff5..3d39977e 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp @@ -143,15 +143,21 @@ __kernel void sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN ( C += offset_x + offset_y*ldc; int i = 0; - do - { + if (beta != 0) { + do + { C[0] = mad(alpha, rC[i][0], beta*C[0]); C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); - C += 16; - - } while (++i < 4); - + } while (++i < 4); + } else { + do + { + C[0] = alpha * rC[i][0]; + C[16 * ldc] = alpha * rC[i][1]; + C += 16; + } while (++i < 4); + } } ); #endif diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp index 370ca885..a41a09ef 100644 --- 
a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -57,34 +57,34 @@ __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src ( float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; - - + + A += offsetA; B += offsetB; C+=offsetC; - + __local float lA[528];//16*32+16 __local float lB[528]; - + uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); - + int CurrentOffSetA = gidx*32+ idy; int CurrentOffSetB = gidy*32+ idy; A += (gidx*32+idy)*lda + idx; B += (gidy*32+idy)*ldb + idx; - - + + uint block_k = K >> 4; - do + do { __local float* plA = lA + idx*33+idy; __local float* plB = lB + idx*33+idy; barrier(CLK_LOCAL_MEM_FENCE); - + plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; @@ -127,21 +127,35 @@ __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src ( C+=offset_x+offset_y*ldc; - int i = 0; - do - { - C[0 ] = mad(alpha, rC[i][0], beta*C[0]); - if(offset_y+16=M ) - return; - + int i = 0; + if (beta != 0) { + do + { + C[0 ] = mad(alpha, rC[i][0], beta*C[0]); + if(offset_y+16=M ) + return; + } + while (++i < 2); + } else { + do + { + C[0 ] = alpha * rC[i][0]; + if(offset_y+16=M ) + return; + } + while (++i < 2); } - while (++i < 2); } ); From d32081a26f766500a7786013a1acc2e708c15957 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 19 Jan 2016 17:21:35 -0500 Subject: [PATCH 12/45] Fix barriers in dtrsm specialized kernels --- src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp | 2 ++ src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp | 2 ++ src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp | 2 ++ src/library/blas/xtrsm.cc | 7 ++++--- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp 
index d60b482b..1f7c19c1 100644 --- a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp +++ b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp @@ -101,6 +101,7 @@ else\n }\n }\n +barrier(CLK_LOCAL_MEM_FENCE);\n /* * the lower case */ @@ -135,6 +136,7 @@ for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n x = workspace + i + 1; \n y = Bs + i*BLOCK_SIZE; \n + barrier(CLK_LOCAL_MEM_FENCE);\n txw = (tx - i - 1); \n diff --git a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp index d435d34e..bc9c2961 100644 --- a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp +++ b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp @@ -94,6 +94,7 @@ uint na)\n Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;\n }\n }\n + barrier(CLK_LOCAL_MEM_FENCE);\n /* the upper case */ for( i=0; i < BLOCK_SIZE; i++ ) {\n @@ -110,6 +111,7 @@ uint na)\n //dtrmv workspace[tx] = *(Bs+i*BLOCK_SIZE+tx);\n y = Bs+i*BLOCK_SIZE;\n + barrier(CLK_LOCAL_MEM_FENCE);\n _Pragma("unroll")\n //for( j=tx; j < i; j++ ) diff --git a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp index 9ea67088..0ffbebf7 100644 --- a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp +++ b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp @@ -94,6 +94,7 @@ else\n Bs[tx*BLOCK_SIZE + tx] = ONE / (Bs[tx*BLOCK_SIZE + tx]); \n }\n }\n +barrier(CLK_LOCAL_MEM_FENCE);\n /* the upper case */ @@ -111,6 +112,7 @@ for (i = 0; i < BLOCK_SIZE; i++) {\n //dtrmv workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n y = Bs + i*BLOCK_SIZE; \n + barrier(CLK_LOCAL_MEM_FENCE);\n _Pragma("unroll")\n //for( j=tx; j < i; j++ ) diff --git a/src/library/blas/xtrsm.cc b/src/library/blas/xtrsm.cc index b6f553b5..c9325a36 100644 --- a/src/library/blas/xtrsm.cc +++ b/src/library/blas/xtrsm.cc @@ -1683,7 +1683,7 @@ clblasDtrsm( const cl_event *eventWaitList, cl_event *events) { - /* +#if 0 
CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); @@ -1718,7 +1718,8 @@ clblasDtrsm( functor->release(); return res; - */ + +#else bool specialCaseHandled = false; //outer block size = 192 @@ -1780,7 +1781,7 @@ clblasDtrsm( numEventsInWaitList, eventWaitList, events); - +#endif } extern "C" From 3ec45fdb3fbf7c2d3a06ec58f01b00ad4b1e70da Mon Sep 17 00:00:00 2001 From: tingxingdong Date: Wed, 20 Jan 2016 10:14:39 -0600 Subject: [PATCH 13/45] fix a bug in gflops count n*(n+1)*n -> n*(n+1)*k --- src/client/clfunc_xsyrk.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp index c6f35964..67b30764 100644 --- a/src/client/clfunc_xsyrk.hpp +++ b/src/client/clfunc_xsyrk.hpp @@ -64,7 +64,7 @@ class xSyrk : public clblasFunc double gflops() { - return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); + return buffer_.n_*(buffer_.n_+1)*buffer_.k_/time_in_ns(); } std::string gflops_formula() @@ -645,7 +645,7 @@ template<> double xSyrk::gflops() { - return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); + return 4*buffer_.n_*(buffer_.n_+1)*buffer_.k_/time_in_ns(); } template<> From c716d405886dcc22d5d4e7a3681aaeb2d78f8cbb Mon Sep 17 00:00:00 2001 From: Anton Lokhmotov Date: Mon, 1 Feb 2016 12:48:27 +0000 Subject: [PATCH 14/45] Only use the -m32 or -m64 compiler flags on x86. --- src/CMakeLists.txt | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1fa849d1..249b3d28 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -296,9 +296,17 @@ if(CMAKE_COMPILER_IS_GNUCXX) # Don't use -rpath. 
set(CMAKE_SKIP_RPATH ON CACHE BOOL "Skip RPATH" FORCE) - set(CMAKE_C_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_CXX_FLAGS}") - set(CMAKE_Fortran_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_Fortran_FLAGS}") + # Need to determine the target machine of the C compiler, because + # the '-m32' and '-m64' flags are supported on x86 but not on e.g. ARM. + exec_program( "${CMAKE_C_COMPILER} -dumpmachine" + OUTPUT_VARIABLE CMAKE_C_COMPILER_MACHINE ) + message( STATUS "CMAKE_C_COMPILER_MACHINE: ${CMAKE_C_COMPILER_MACHINE}" ) + # The "86" regular expression matches x86, x86_64, i686, etc. + if(${CMAKE_C_COMPILER_MACHINE} MATCHES "86") + set(CMAKE_C_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_CXX_FLAGS}") + set(CMAKE_Fortran_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_Fortran_FLAGS}") + endif() if(TARGET_PLATFORM EQUAL 32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin") From 1f63b34b9757f0f5b3cef17e4eedc0f529fab470 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 25 Feb 2016 21:20:47 +0100 Subject: [PATCH 15/45] Update .travis.yml and appveyor.yml OpenCL headers were moved to github --- .travis.yml | 16 +++++++++------- appveyor.yml | 17 ++++++++++------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 43999194..ab440ba3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -113,19 +113,21 @@ install: - if [ ${TRAVIS_OS_NAME} == "linux" ]; then mkdir -p ${OPENCL_ROOT}; pushd ${OPENCL_ROOT}; - wget ${OPENCL_REGISTRY}/specs/opencl-icd-1.2.11.0.tgz; - tar -xf opencl-icd-1.2.11.0.tgz; - mv ./icd/* .; - mkdir -p inc/CL; + travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; + mv ./OpenCL-ICD-Loader/* .; + travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL; pushd inc/CL; - wget -r -w 1 -np -nd -nv -A h,hpp https://www.khronos.org/registry/cl/api/1.2/; - wget -w 1 -np 
-nd -nv -A h,hpp https://www.khronos.org/registry/cl/api/2.1/cl.hpp; + travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp; popd; mkdir -p lib; pushd lib; cmake -G "Unix Makefiles" ..; make; - cp ../bin/libOpenCL.so .; + cp ./bin/libOpenCL.so .; + popd; + pushd inc/CL; + travis_retry git fetch origin opencl12:opencl12; + git checkout opencl12; popd; mv inc/ include/; popd; diff --git a/appveyor.yml b/appveyor.yml index bb1f2b34..7a175b23 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -40,15 +40,14 @@ install: - ps: mkdir $env:OPENCL_ROOT - ps: pushd $env:OPENCL_ROOT - ps: $opencl_registry = $env:OPENCL_REGISTRY - # This downloads the source to the example/demo icd library - - ps: wget $opencl_registry/specs/opencl-icd-1.2.11.0.tgz -OutFile opencl-icd-1.2.11.0.tgz - - ps: 7z x opencl-icd-1.2.11.0.tgz - - ps: 7z x opencl-icd-1.2.11.0.tar - - ps: mv .\icd\* . + # This downloads the source to the Khronos ICD library + - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git + - ps: mv ./OpenCL-ICD-Loader/* . # This downloads all the opencl header files # The cmake build files expect a directory called inc - ps: mkdir inc/CL - - ps: wget $opencl_registry/api/1.2/ | select -ExpandProperty links | where {$_.href -like "*.h*"} | select -ExpandProperty outerText | foreach{ wget $opencl_registry/api/1.2/$_ -OutFile inc/CL/$_ } + - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL + - ps: wget $opencl_registry/api/2.1/cl.hpp -OutFile inc/CL/cl.hpp # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom } # Create the static import lib in a directory called lib, so findopencl() will find it - ps: mkdir lib @@ -56,10 +55,14 @@ install: - cmake -G "NMake Makefiles" .. 
- nmake - ps: popd + # Switch to OpenCL 1.2 headers + - ps: pushd inc/CL + - git fetch origin opencl12:opencl12 + - git checkout opencl12 + - ps: popd # Rename the inc directory to include, so FindOpencl() will find it - ps: ren inc include - ps: popd - - ps: popd # before_build is used to run configure steps before_build: From 3e2c8264bbd83058b5ad5e1df60aaab64fee011b Mon Sep 17 00:00:00 2001 From: David Tanner Date: Mon, 29 Feb 2016 15:20:16 -0600 Subject: [PATCH 16/45] proposed fix for gemm thread safety; using thread-local storage for kernel map using pre-C++-11 syntax --- src/library/blas/xgemm.cc | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index 71165dd4..67e2e507 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -135,7 +136,17 @@ void makeGemmKernel( { //TODO: This will need to be converted to thread local when making clBLAS thread safe typedef std::map kernel_map_t; - static kernel_map_t kernel_map; + +#if defined( _WIN32 ) + __declspec( thread ) static kernel_map_t *kernel_map = 0; + + +#else + __thread static kernel_map_t *kernel_map = 0; +#endif + if (!kernel_map) { + kernel_map = new kernel_map_t(); + } cl_context clContext; cl_device_id clDevice; @@ -159,11 +170,11 @@ void makeGemmKernel( // Check if kernel exists for this device std::string key = prefix + "_" + kernelName; - kernel_map_t::iterator idx = kernel_map.find(key); + kernel_map_t::iterator idx = kernel_map->find(key); // If kernel not found for this device, set to NULL - if (idx == kernel_map.end()) { + if (idx == kernel_map->end()) { *clKernel = NULL; } else { *clKernel = idx->second; @@ -251,7 +262,7 @@ void makeGemmKernel( #endif std::string key = prefix + "_" + kernelName; - kernel_map[key] = *clKernel; + (*kernel_map)[key] = *clKernel; delete[] kernelName; } From 
1ab9efd62cd5c071cb977f4d2ccdfb86c129eb26 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Mon, 29 Feb 2016 15:41:27 -0600 Subject: [PATCH 17/45] re-submit after CI fix; removing dummy whitespace --- src/library/blas/xgemm.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index 67e2e507..5c273517 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -136,11 +136,9 @@ void makeGemmKernel( { //TODO: This will need to be converted to thread local when making clBLAS thread safe typedef std::map kernel_map_t; - + #if defined( _WIN32 ) __declspec( thread ) static kernel_map_t *kernel_map = 0; - - #else __thread static kernel_map_t *kernel_map = 0; #endif From 2b56167fbc887acaf42aa7707ab8cfcd3a8c468e Mon Sep 17 00:00:00 2001 From: David Tanner Date: Tue, 1 Mar 2016 10:26:54 -0600 Subject: [PATCH 18/45] TLS for gcc 4.6 --- src/library/blas/xgemm.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index 5c273517..69ddb7de 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -28,6 +27,16 @@ // #include #include "xgemm.h" +//#define GCC_VERSION (__GNUC__ * 10000 \ +// + __GNUC_MINOR__ * 100 \ +// + __GNUC_PATCHLEVEL__) + +#ifdef _WIN32 +//#include +#else +#include +#endif + /****************************************************************************** * Row major -> column major *****************************************************************************/ @@ -140,6 +149,7 @@ void makeGemmKernel( #if defined( _WIN32 ) __declspec( thread ) static kernel_map_t *kernel_map = 0; #else +#include __thread static kernel_map_t *kernel_map = 0; #endif if (!kernel_map) { From 02cf387b9f196cc1083402258ead197aeff6c42e Mon Sep 17 00:00:00 2001 From: David Tanner Date: Wed, 2 Mar 2016 15:40:52 -0600 Subject: 
[PATCH 19/45] fixing duplicate include and removing TODO note --- src/library/blas/xgemm.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index 69ddb7de..5c1ea112 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -27,10 +27,6 @@ // #include #include "xgemm.h" -//#define GCC_VERSION (__GNUC__ * 10000 \ -// + __GNUC_MINOR__ * 100 \ -// + __GNUC_PATCHLEVEL__) - #ifdef _WIN32 //#include #else @@ -143,13 +139,11 @@ void makeGemmKernel( size_t *kernelBinarySize, const char *binaryBuildOptions) { - //TODO: This will need to be converted to thread local when making clBLAS thread safe typedef std::map kernel_map_t; #if defined( _WIN32 ) __declspec( thread ) static kernel_map_t *kernel_map = 0; #else -#include __thread static kernel_map_t *kernel_map = 0; #endif if (!kernel_map) { From 7a74778675abecd0d1a2b7158ad1061ef7b9d648 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 3 Mar 2016 14:31:47 -0600 Subject: [PATCH 20/45] compiling kernels is now thread safe; not using global cl_kernel objects --- src/library/blas/xgemm.cc | 82 +++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index 5c1ea112..a79fdcdc 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -20,6 +20,7 @@ #include #include #include +#include "mutex.h" #include "AutoGemmIncludes/AutoGemmKernelSelection.h" #include "GemmSpecialCases.h" @@ -126,12 +127,38 @@ static char *getKernelName(cl_kernel clKernel) return kernelName; } +typedef struct kernel_map_key_ { + cl_context context; // address of context + cl_device_id device; // address of device + const char *kernelSource; // address of kernel source +} kernel_map_key; + +bool operator<(const kernel_map_key & l, const kernel_map_key & r) { + if (l.context < r.context) { + return true; + } else if (r.context < l.context) { + return false; + } + if 
(l.device < r.device) { + return true; + } else if (r.device < l.device) { + return false; + } + if (l.kernelSource < r.kernelSource) { + return true; + } else if (r.kernelSource < l.kernelSource) { + return false; + } + return false; +} + + /****************************************************************************** * Make Gemm Kernel *****************************************************************************/ //FIXME: This function should be returning an error. void makeGemmKernel( - cl_kernel *clKernel, + cl_kernel *clKernel, // ignored as input; returns as output cl_command_queue clQueue, const char *kernelSource, const char *sourceBuildOptions, @@ -139,9 +166,8 @@ void makeGemmKernel( size_t *kernelBinarySize, const char *binaryBuildOptions) { - typedef std::map kernel_map_t; - -#if defined( _WIN32 ) + typedef std::map kernel_map_t; + #if defined( _WIN32 ) __declspec( thread ) static kernel_map_t *kernel_map = 0; #else __thread static kernel_map_t *kernel_map = 0; @@ -159,33 +185,20 @@ void makeGemmKernel( err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL); CL_CHECK(err) - std::stringstream ss; - ss << clDevice << "_" << clContext; - std::string prefix = ss.str(); - - if (*clKernel) { - char *kernelName = getKernelName(*clKernel); - // kernel has already been built, return -#ifdef AUTOGEMM_PRINT_DEBUG - printf("makeGemmKernel: \"%s\" already built; returning.\n", kernelName); -#endif - - // Check if kernel exists for this device - std::string key = prefix + "_" + kernelName; - kernel_map_t::iterator idx = kernel_map->find(key); - - - // If kernel not found for this device, set to NULL - if (idx == kernel_map->end()) { - *clKernel = NULL; - } else { - *clKernel = idx->second; - } - - delete[] kernelName; + // is kernel already compiled? 
+ kernel_map_key key; + key.kernelSource = kernelSource; + key.context = clContext; + key.device = clDevice; + kernel_map_t::iterator idx = kernel_map->find(key); + if (idx == kernel_map->end()) { + *clKernel = NULL; + } else { + *clKernel = idx->second; + return; } - if (!*clKernel) { + if (true /*!*clKernel*/) { // since kernel wasn't found in map // kernel has not been built, so build it (from binary, preferably) cl_program clProgram; cl_int clBinaryStatus; @@ -257,17 +270,13 @@ void makeGemmKernel( err = clReleaseProgram(clProgram); CL_CHECK(err) - char *kernelName = getKernelName(*clKernel); - #ifdef AUTOGEMM_PRINT_DEBUG printf("makeGemmKernel: \"%s\" now built; returning.\n", kernelName); #endif - std::string key = prefix + "_" + kernelName; + //put kernel in map (*kernel_map)[key] = *clKernel; - delete[] kernelName; } - return; } @@ -557,6 +566,11 @@ clblasGemm( /****************************************************************************** * Build kernels *****************************************************************************/ + + tileClKernel = NULL; + rowClKernel = NULL; + colClKernel = NULL; + cornerClKernel = NULL; if (needTileKernel) makeGemmKernel( tileClKernel, commandQueues[0], tileKernelSource, sourceBuildOptions, &tileKernelBinary, tileKernelBinarySize, binaryBuildOptions); if (needRowKernel) makeGemmKernel( rowClKernel, commandQueues[0], rowKernelSource, sourceBuildOptions, &rowKernelBinary, rowKernelBinarySize, binaryBuildOptions); if (needColKernel) makeGemmKernel( colClKernel, commandQueues[0], colKernelSource, sourceBuildOptions, &colKernelBinary, colKernelBinarySize, binaryBuildOptions); From c590881ff9eea7f92eeb3d0555c5b8bef0d8f42e Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 3 Mar 2016 15:58:04 -0600 Subject: [PATCH 21/45] thread safety: no longer using global cl_kernel objects. 
thread safety is fixed pending customer verification --- src/library/blas/xgemm.cc | 51 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index a79fdcdc..3e99ce4d 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -158,7 +158,7 @@ bool operator<(const kernel_map_key & l, const kernel_map_key & r) { *****************************************************************************/ //FIXME: This function should be returning an error. void makeGemmKernel( - cl_kernel *clKernel, // ignored as input; returns as output + cl_kernel *clKernel, // ignored as input; returns as output only cl_command_queue clQueue, const char *kernelSource, const char *sourceBuildOptions, @@ -461,10 +461,10 @@ clblasGemm( size_t *colKernelBinarySize = 0; size_t *cornerKernelBinarySize = 0; const char *binaryBuildOptions = NULL; - cl_kernel *tileClKernel = NULL; - cl_kernel *rowClKernel = NULL; - cl_kernel *colClKernel = NULL; - cl_kernel *cornerClKernel = NULL; + cl_kernel *tileClKernelDummy = NULL; // no longer used; broke thread safety + cl_kernel *rowClKernelDummy = NULL; // no longer used; broke thread safety + cl_kernel *colClKernelDummy = NULL; // no longer used; broke thread safety + cl_kernel *cornerClKernelDummy = NULL; // no longer used; broke thread safety unsigned int workGroupNumRows; unsigned int workGroupNumCols; unsigned int microTileNumRows; @@ -489,10 +489,10 @@ clblasGemm( &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, - &tileClKernel, - &rowClKernel, - &colClKernel, - &cornerClKernel, + &tileClKernelDummy, + &rowClKernelDummy, + &colClKernelDummy, + &cornerClKernelDummy, &workGroupNumRows, &workGroupNumCols, µTileNumRows, @@ -530,10 +530,10 @@ clblasGemm( &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, - &tileClKernel, - &rowClKernel, - &colClKernel, - &cornerClKernel, + &tileClKernelDummy, + 
&rowClKernelDummy, + &colClKernelDummy, + &cornerClKernelDummy, &workGroupNumRows, &workGroupNumCols, µTileNumRows, @@ -567,14 +567,15 @@ clblasGemm( * Build kernels *****************************************************************************/ - tileClKernel = NULL; - rowClKernel = NULL; - colClKernel = NULL; - cornerClKernel = NULL; - if (needTileKernel) makeGemmKernel( tileClKernel, commandQueues[0], tileKernelSource, sourceBuildOptions, &tileKernelBinary, tileKernelBinarySize, binaryBuildOptions); - if (needRowKernel) makeGemmKernel( rowClKernel, commandQueues[0], rowKernelSource, sourceBuildOptions, &rowKernelBinary, rowKernelBinarySize, binaryBuildOptions); - if (needColKernel) makeGemmKernel( colClKernel, commandQueues[0], colKernelSource, sourceBuildOptions, &colKernelBinary, colKernelBinarySize, binaryBuildOptions); - if (needCornerKernel) makeGemmKernel(cornerClKernel, commandQueues[0], cornerKernelSource, sourceBuildOptions, &cornerKernelBinary, cornerKernelBinarySize, binaryBuildOptions); + + cl_kernel tileClKernel = NULL; + cl_kernel rowClKernel = NULL; + cl_kernel colClKernel = NULL; + cl_kernel cornerClKernel = NULL; + if (needTileKernel) makeGemmKernel( &tileClKernel, commandQueues[0], tileKernelSource, sourceBuildOptions, &tileKernelBinary, tileKernelBinarySize, binaryBuildOptions); + if (needRowKernel) makeGemmKernel( &rowClKernel, commandQueues[0], rowKernelSource, sourceBuildOptions, &rowKernelBinary, rowKernelBinarySize, binaryBuildOptions); + if (needColKernel) makeGemmKernel( &colClKernel, commandQueues[0], colKernelSource, sourceBuildOptions, &colKernelBinary, colKernelBinarySize, binaryBuildOptions); + if (needCornerKernel) makeGemmKernel(&cornerClKernel, commandQueues[0], cornerKernelSource, sourceBuildOptions, &cornerKernelBinary, cornerKernelBinarySize, binaryBuildOptions); const size_t localWorkSize[2] = { workGroupNumRows, workGroupNumCols }; unsigned int numKernelsEnqueued = 0; @@ -603,7 +604,7 @@ clblasGemm( if (needTileKernel) { 
//printf("enqueueing tile kernel\n"); size_t globalWorkSize[2] = {(M/macroTileNumRows)*workGroupNumRows, (N/macroTileNumCols)*workGroupNumCols }; - err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *tileClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], tileClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, @@ -618,7 +619,7 @@ clblasGemm( if (needRowKernel) { //printf("enqueueing row kernel\n"); size_t globalWorkSize[2] = {1*workGroupNumRows, (N/macroTileNumCols)*workGroupNumCols }; - err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *rowClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], rowClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, @@ -633,7 +634,7 @@ clblasGemm( if (needColKernel) { //printf("enqueueing col kernel\n"); size_t globalWorkSize[2] = { (M/macroTileNumRows)*workGroupNumRows, 1*workGroupNumCols }; - err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *colClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], colClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, @@ -648,7 +649,7 @@ clblasGemm( if (needCornerKernel) { //printf("enqueueing corner kernel\n"); size_t globalWorkSize[2] = { 1*workGroupNumRows, 1*workGroupNumCols }; - err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *cornerClKernel, + err = enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], cornerClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, From ed8ee7e768b9621886650f69ae0474db00377798 Mon Sep 17 00:00:00 2001 From: tim Date: Fri, 
18 Mar 2016 14:24:04 -0500 Subject: [PATCH 22/45] fix the compilation bug about c(z)dotc_ --- src/tests/correctness/blas-lapack.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c index 9687bdf3..54666b13 100644 --- a/src/tests/correctness/blas-lapack.c +++ b/src/tests/correctness/blas-lapack.c @@ -655,7 +655,7 @@ complex cdotu( int n, complex *x, int incx, complex *y, int incy) #elif defined( __APPLE__) cblas_cdotu_sub(n, x, incx, y, incy, &ans); #else - cdotusub_(&n, x, &incx, y, &incy, &ans); + ans = cdotu_(&n, x, &incx, y, &incy); #endif return ans; @@ -670,7 +670,7 @@ doublecomplex zdotu( int n, doublecomplex *x, int incx, doublecomplex *y, int i #elif defined(__APPLE__) cblas_zdotu_sub(n, x, incx, y, incy, &ans); #else - zdotusub_(&n, x, &incx, y, &incy, &ans); + ans = zdotu_(&n, x, &incx, y, &incy); #endif return ans; @@ -685,7 +685,7 @@ complex cdotc( int n, complex *x, int incx, complex *y, int incy) #elif defined(__APPLE__) cblas_cdotc_sub(n, x, incx, y, incy, &ans); #else - cdotcsub_(&n, x, &incx, y, &incy, &ans); + ans = cdotc_(&n, x, &incx, y, &incy); #endif return ans; @@ -700,7 +700,7 @@ doublecomplex zdotc( int n, doublecomplex *x, int incx, doublecomplex *y, int i #elif defined(__APPLE__) cblas_zdotc_sub(n, x, incx, y, incy, &ans); #else - zdotcsub_(&n, x, &incx, y, &incy, &ans); + ans = zdotc_(&n, x, &incx, y, &incy); #endif return ans; From 24656629327f229348eb503172d4f2e6888c3411 Mon Sep 17 00:00:00 2001 From: tim Date: Tue, 22 Mar 2016 17:46:40 -0500 Subject: [PATCH 23/45] fix the header accordingly --- src/tests/correctness/blas-lapack.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tests/correctness/blas-lapack.h b/src/tests/correctness/blas-lapack.h index d2db1aa3..8619e1e7 100644 --- a/src/tests/correctness/blas-lapack.h +++ b/src/tests/correctness/blas-lapack.h @@ -1170,10 +1170,10 @@ double ddot_(int *n, double 
*x, int *incx, double* y, int *incy); complex cdotc_(int *n, complex *x, int *incx, complex* y, int *incy); doublecomplex zdotc_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy); #else - void cdotusub_(int *n, complex *x, int *incx, complex* y, int *incy, complex *ans); - void zdotusub_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy, doublecomplex *ans); - void cdotcsub_(int *n, complex *x, int *incx, complex* y, int *incy, complex *ans); - void zdotcsub_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy, doublecomplex *ans); + complex cdotu_(int *n, complex *x, int *incx, complex* y, int *incy); + doublecomplex zdotu_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy); + complex cdotc_(int *n, complex *x, int *incx, complex* y, int *incy); + doublecomplex zdotc_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy); #endif void sswap_(int *n, float *x, int *incx, float* y, int *incy); From 8491085874c29f2fbc4b5a1d8f70bc3e15b1f87a Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 24 Mar 2016 21:57:36 -0500 Subject: [PATCH 24/45] fixed compareMatrices to use GTEST_FLOAT_EQ --- src/tests/common.cpp | 21 ++++ src/tests/include/common.h | 4 + src/tests/include/matrix.h | 217 +++++-------------------------------- 3 files changed, 51 insertions(+), 191 deletions(-) diff --git a/src/tests/common.cpp b/src/tests/common.cpp index d0f21ba4..209a598a 100644 --- a/src/tests/common.cpp +++ b/src/tests/common.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -1016,3 +1017,23 @@ functionBlasLevel(BlasFunctionID funct) { return 0; } } + + +template<> +void gtestAssertElementsEqual( const float & a, const float & b) { + ASSERT_FLOAT_EQ(a, b); +} +template<> +void gtestAssertElementsEqual( const double & a, const double & b) { + ASSERT_DOUBLE_EQ(a, b); +} +template<> +void gtestAssertElementsEqual( const FloatComplex & a, const FloatComplex & b) { + ASSERT_FLOAT_EQ(CREAL(a), 
CREAL(b)); + ASSERT_FLOAT_EQ(CIMAG(a), CIMAG(b)); +} +template<> +void gtestAssertElementsEqual( const DoubleComplex & a, const DoubleComplex & b) { + ASSERT_DOUBLE_EQ(CREAL(a), CREAL(b)); + ASSERT_DOUBLE_EQ(CIMAG(a), CIMAG(b)); +} \ No newline at end of file diff --git a/src/tests/include/common.h b/src/tests/include/common.h index edbdb434..52f71b1d 100644 --- a/src/tests/include/common.h +++ b/src/tests/include/common.h @@ -692,6 +692,10 @@ printTestParams( size_t offx, int incx); + +template +void gtestAssertElementsEqual( const T & a, const T & b); + #endif // __cplusplus #endif /* COMMON_H_ */ diff --git a/src/tests/include/matrix.h b/src/tests/include/matrix.h index 65757add..ad849948 100644 --- a/src/tests/include/matrix.h +++ b/src/tests/include/matrix.h @@ -298,6 +298,7 @@ reorderMatrix( } } + template static void compareMatrices( @@ -315,207 +316,41 @@ compareMatrices( if( lda > 0 ) // General case { - for (m = 0; m < M; m++) { - for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - delta = absDelta[m * N + n]; + for (m = 0; m < M; m++) { + for (n = 0; n < N; n++) { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + gtestAssertElementsEqual(a, b); } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); } } - } else // Packed case { - if ( order == clblasColumnMajor) - { - for ( n = 0; n < N; n++) - { - for( m=n; m < M; m++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - //delta = absDelta[m * N + n]; - } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); - } - } - } - else - { - for ( m = 0; m < M; m++) - { - for( n = 0; n <= m; n++) - { - a = getElement(order, clblasNoTrans, 
m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - //delta = absDelta[m * N + n]; - } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); - } - } - } - } -} - -template<> -__template_static void -compareMatrices( - clblasOrder order, - size_t M, - size_t N, - const FloatComplex *A, - const FloatComplex *B, - size_t lda, - const cl_double *absDelta) -{ - size_t m = 0, n = 0; - FloatComplex a, b; - cl_double delta; - -if ( lda > 0 ) -{ - for (m = 0; m < M; m++) { - for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - delta = absDelta[m * N + n]; + if ( order == clblasColumnMajor) + { + for ( n = 0; n < N; n++) + { + for( m=n; m < M; m++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + gtestAssertElementsEqual(a, b); + } } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) - printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } - } -} - else // Packed case - { - if ( order == clblasColumnMajor) - { - for ( n = 0; n < N; n++) - { - for( m=n; m < M; m++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - //delta = absDelta[m * N + n]; - } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) - printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); - } - } - } - else - { - for ( m = 0; m < M; m++) - { - for( n = 0; n <= m; n++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if 
(absDelta != NULL) { - //delta = absDelta[m * N + n]; - } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) - printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); - } - } - } - } - -} - -template<> -__template_static void -compareMatrices( - clblasOrder order, - size_t M, - size_t N, - const DoubleComplex *A, - const DoubleComplex *B, - size_t lda, - const cl_double *absDelta) -{ - size_t m = 0, n = 0; - DoubleComplex a, b; - cl_double delta; -if( lda > 0 ) -{ - for (m = 0; m < M; m++) { - for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - delta = absDelta[m * N + n]; + else + { + for ( m = 0; m < M; m++) + { + for( n = 0; n <= m; n++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + gtestAssertElementsEqual(a, b); + } } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) - printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } -} - else // Packed case - { - if ( order == clblasColumnMajor) - { - for ( n = 0; n < N; n++) - { - for( m=n; m < M; m++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if (absDelta != NULL) { - //delta = absDelta[m * N + n]; - } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) - printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); - } - } - } - else - { - for ( m = 0; m < M; m++) - { - for( n = 0; n <= m; n++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - delta = 0.0; - if 
(absDelta != NULL) { - //delta = absDelta[m * N + n]; - } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) - printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); - } - } - } - } - } template From 184bb077be057be5e8760d09afd0d32fb5e8a284 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 27 Mar 2016 23:53:51 +0200 Subject: [PATCH 25/45] fix error with missing KernelName variable --- src/library/blas/xgemm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index 3e99ce4d..eb781127 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -271,7 +271,7 @@ void makeGemmKernel( CL_CHECK(err) #ifdef AUTOGEMM_PRINT_DEBUG - printf("makeGemmKernel: \"%s\" now built; returning.\n", kernelName); + printf("makeGemmKernel now built; returning.\n"); #endif //put kernel in map From d103fee751974b2f132d4b9925cea56714f77e4f Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 30 Mar 2016 11:04:45 +0200 Subject: [PATCH 26/45] Add .pyc files to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 5254baf3..bb362782 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,6 @@ src/build/ +# python compiled files +*.pyc + From be56a613dca5462eb569dc0072769fb4da0cce7e Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Wed, 13 Apr 2016 13:20:36 -0500 Subject: [PATCH 27/45] Revert "fixed compareMatrices to use GTEST_FLOAT_EQ" This reverts commit 8491085874c29f2fbc4b5a1d8f70bc3e15b1f87a. 
--- src/tests/common.cpp | 21 ---- src/tests/include/common.h | 4 - src/tests/include/matrix.h | 217 ++++++++++++++++++++++++++++++++----- 3 files changed, 191 insertions(+), 51 deletions(-) diff --git a/src/tests/common.cpp b/src/tests/common.cpp index 209a598a..d0f21ba4 100644 --- a/src/tests/common.cpp +++ b/src/tests/common.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include @@ -1017,23 +1016,3 @@ functionBlasLevel(BlasFunctionID funct) { return 0; } } - - -template<> -void gtestAssertElementsEqual( const float & a, const float & b) { - ASSERT_FLOAT_EQ(a, b); -} -template<> -void gtestAssertElementsEqual( const double & a, const double & b) { - ASSERT_DOUBLE_EQ(a, b); -} -template<> -void gtestAssertElementsEqual( const FloatComplex & a, const FloatComplex & b) { - ASSERT_FLOAT_EQ(CREAL(a), CREAL(b)); - ASSERT_FLOAT_EQ(CIMAG(a), CIMAG(b)); -} -template<> -void gtestAssertElementsEqual( const DoubleComplex & a, const DoubleComplex & b) { - ASSERT_DOUBLE_EQ(CREAL(a), CREAL(b)); - ASSERT_DOUBLE_EQ(CIMAG(a), CIMAG(b)); -} \ No newline at end of file diff --git a/src/tests/include/common.h b/src/tests/include/common.h index 52f71b1d..edbdb434 100644 --- a/src/tests/include/common.h +++ b/src/tests/include/common.h @@ -692,10 +692,6 @@ printTestParams( size_t offx, int incx); - -template -void gtestAssertElementsEqual( const T & a, const T & b); - #endif // __cplusplus #endif /* COMMON_H_ */ diff --git a/src/tests/include/matrix.h b/src/tests/include/matrix.h index ad849948..65757add 100644 --- a/src/tests/include/matrix.h +++ b/src/tests/include/matrix.h @@ -298,7 +298,6 @@ reorderMatrix( } } - template static void compareMatrices( @@ -316,41 +315,207 @@ compareMatrices( if( lda > 0 ) // General case { - for (m = 0; m < M; m++) { - for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - gtestAssertElementsEqual(a, b); + for (m = 0; m < M; m++) { + for (n = 0; n < N; n++) 
{ + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + delta = absDelta[m * N + n]; } + if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(a, b, delta); } } + } else // Packed case { - if ( order == clblasColumnMajor) - { - for ( n = 0; n < N; n++) - { - for( m=n; m < M; m++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - gtestAssertElementsEqual(a, b); - } + if ( order == clblasColumnMajor) + { + for ( n = 0; n < N; n++) + { + for( m=n; m < M; m++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + //delta = absDelta[m * N + n]; + } + if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(a, b, delta); + } + } + } + else + { + for ( m = 0; m < M; m++) + { + for( n = 0; n <= m; n++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + //delta = absDelta[m * N + n]; + } + if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(a, b, delta); + } + } + } + } +} + +template<> +__template_static void +compareMatrices( + clblasOrder order, + size_t M, + size_t N, + const FloatComplex *A, + const FloatComplex *B, + size_t lda, + const cl_double *absDelta) +{ + size_t m = 0, n = 0; + FloatComplex a, b; + cl_double delta; + +if ( lda > 0 ) +{ + for (m = 0; m < M; m++) { + for (n = 0; n < N; n++) { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + delta = absDelta[m * N + n]; } + if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + printf("m : %d\t n: %d\n", (int)m, (int)n); + 
ASSERT_NEAR(CREAL(a), CREAL(b), delta); + ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } - else - { - for ( m = 0; m < M; m++) - { - for( n = 0; n <= m; n++) - { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); - gtestAssertElementsEqual(a, b); - } + } +} + else // Packed case + { + if ( order == clblasColumnMajor) + { + for ( n = 0; n < N; n++) + { + for( m=n; m < M; m++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + //delta = absDelta[m * N + n]; + } + if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(CREAL(a), CREAL(b), delta); + ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + } + } + } + else + { + for ( m = 0; m < M; m++) + { + for( n = 0; n <= m; n++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + //delta = absDelta[m * N + n]; + } + if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(CREAL(a), CREAL(b), delta); + ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + } + } + } + } + +} + +template<> +__template_static void +compareMatrices( + clblasOrder order, + size_t M, + size_t N, + const DoubleComplex *A, + const DoubleComplex *B, + size_t lda, + const cl_double *absDelta) +{ + size_t m = 0, n = 0; + DoubleComplex a, b; + cl_double delta; +if( lda > 0 ) +{ + for (m = 0; m < M; m++) { + for (n = 0; n < N; n++) { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + delta = absDelta[m * N + n]; } + if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + printf("m : %d\t n: %d\n", (int)m, (int)n); + 
ASSERT_NEAR(CREAL(a), CREAL(b), delta); + ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } +} + else // Packed case + { + if ( order == clblasColumnMajor) + { + for ( n = 0; n < N; n++) + { + for( m=n; m < M; m++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + //delta = absDelta[m * N + n]; + } + if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(CREAL(a), CREAL(b), delta); + ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + } + } + } + else + { + for ( m = 0; m < M; m++) + { + for( n = 0; n <= m; n++) + { + a = getElement(order, clblasNoTrans, m, n, A, lda); + b = getElement(order, clblasNoTrans, m, n, B, lda); + delta = 0.0; + if (absDelta != NULL) { + //delta = absDelta[m * N + n]; + } + if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(CREAL(a), CREAL(b), delta); + ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + } + } + } + } + } template From 1e86e348fc1e738dc3307289db07e3b1141527bc Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Thu, 14 Apr 2016 18:04:11 -0500 Subject: [PATCH 28/45] Adding detection for boost 1.60 --- src/CMakeLists.txt | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 249b3d28..7d90f28a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,33 +41,33 @@ set( OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OFF) #use dynamic generated kernels # MESSAGE(STATUS "Build dynamic Hawaii kernels.") # MESSAGE(STATUS "Check OPENCL_OFFLINE_BUILD_HAWAII_KERNEL to build kernls at compile-time. This will eliminates clBuildProgram() overhead and better kernel performance with certain driver.") - add_definitions(-DCLBLAS_HAWAII_DYNAMIC_KERNEL) + add_definitions(-DCLBLAS_HAWAII_DYNAMIC_KERNEL) #else() # MESSAGE(STATUS "Build static Hawaii kernels.") # MESSAGE(STATUS "Uncheck OPENCL_OFFLINE_BUILD_HAWAII_KERNEL to build kernls at run-time") -# MESSAGE(STATUS "Please ensure the presence of Hawaii device in the system. With certain driver/compiler flags, this might result in compile-time error.") +# MESSAGE(STATUS "Please ensure the presence of Hawaii device in the system. With certain driver/compiler flags, this might result in compile-time error.") #endif( ) #if( NOT OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL ) #use dynamic generated kernels # MESSAGE(STATUS "Build dynamic Bonaire kernels.") # MESSAGE(STATUS "Check OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL to build kernls at compile-time. This will eliminates clBuildProgram() overhead and better kernel performance with certain driver.") - add_definitions(-DCLBLAS_BONAIRE_DYNAMIC_KERNEL) + add_definitions(-DCLBLAS_BONAIRE_DYNAMIC_KERNEL) #else() # MESSAGE(STATUS "Build static Bonaire kernels.") # MESSAGE(STATUS "Uncheck OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL to build kernls at run-time") -# MESSAGE(STATUS "Please ensure the presence of Bonaire device in the system. 
With certain driver/compiler flags, this might result in compile-time error.") +# MESSAGE(STATUS "Please ensure the presence of Bonaire device in the system. With certain driver/compiler flags, this might result in compile-time error.") #endif( ) #if( NOT OPENCL_OFFLINE_BUILD_TAHITI_KERNEL ) #use dynamic generated kernels # MESSAGE(STATUS "Build dynamic Tahiti kernels.") # MESSAGE(STATUS "Check OPENCL_OFFLINE_BUILD_TAHITI_KERNEL to build kernls at compile-time. This will eliminates clBuildProgram() overhead and better kernel performance with certain driver.") - add_definitions(-DCLBLAS_TAHITI_DYNAMIC_KERNEL) + add_definitions(-DCLBLAS_TAHITI_DYNAMIC_KERNEL) #else( ) # MESSAGE(STATUS "Build static Tahiti kernels.") # MESSAGE(STATUS "Uncheck OPENCL_OFFLINE_BUILD_TAHITI_KERNEL to build kernls at run-time") -# MESSAGE(STATUS "Please ensure the presence of Tahiti device in the system. With certain driver/compiler flags, this might result in compile-time error.") +# MESSAGE(STATUS "Please ensure the presence of Tahiti device in the system. With certain driver/compiler flags, this might result in compile-time error.") #endif( ) @@ -135,8 +135,8 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -# These variables are meant to contain string which should be appended to the installation paths -# of library and executable binaries, respectively. They are meant to be user configurable/overridable. +# These variables are meant to contain string which should be appended to the installation paths +# of library and executable binaries, respectively. They are meant to be user configurable/overridable. 
set( SUFFIX_LIB_DEFAULT "" ) set( SUFFIX_BIN_DEFAULT "" ) @@ -170,7 +170,7 @@ if( MSVC_IDE ) endif( ) # add the math library for Linux -if( UNIX ) +if( UNIX ) set(MATH_LIBRARY "m") endif() @@ -220,7 +220,7 @@ if( BUILD_TEST ) else() message(WARNING "Cannot find acml.h") endif() - + if( UNIX ) find_library(ACML_LIBRARIES acml_mp HINTS @@ -238,7 +238,7 @@ if( BUILD_TEST ) ) mark_as_advanced(_acml_mv_library) endif( ) - + if(WIN32) find_library(ACML_LIBRARIES libacml_mp_dll HINTS @@ -248,7 +248,7 @@ if( BUILD_TEST ) $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib ) endif( ) - + if( NOT ACML_LIBRARIES ) message(WARNING "Cannot find libacml") endif( ) @@ -272,8 +272,8 @@ find_package( OpenCL ) set( Boost_USE_MULTITHREADED ON ) set( Boost_USE_STATIC_LIBS ON ) set( Boost_DETAILED_FAILURE_MSG ON ) -set( Boost_DEBUG ON ) -set( Boost_ADDITIONAL_VERSIONS "1.44.0" "1.44" "1.47.0" "1.47" ) +# set( Boost_DEBUG ON ) +set( Boost_ADDITIONAL_VERSIONS "1.44.0" "1.44" "1.47.0" "1.47" "1.60.0" "1.60" ) find_package( Boost 1.33.0 COMPONENTS program_options ) message(STATUS "Boost_PROGRAM_OPTIONS_LIBRARY: ${Boost_PROGRAM_OPTIONS_LIBRARY}") @@ -315,7 +315,7 @@ elseif( MSVC ) # CMake sets huge stack frames for windows, for whatever reason. We go with compiler default. 
string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" ) string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" ) - string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" ) + string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" ) endif( ) if (WIN32) @@ -328,13 +328,13 @@ add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS ) configure_file( "${PROJECT_SOURCE_DIR}/clBLAS.version.h.in" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" ) # configure a header file to pass the CMake version settings to the source, and package the header files in the output archive -install( FILES - "clBLAS.h" +install( FILES + "clBLAS.h" "clAmdBlas.h" "clAmdBlas.version.h" "clBLAS-complex.h" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" - DESTINATION + DESTINATION "./include" ) @@ -359,7 +359,7 @@ if( BUILD_SAMPLE AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/samples" ) add_subdirectory( samples ) endif( ) -# The build server is not supposed to build or package any of the tests; build server script will define this on the command line with +# The build server is not supposed to build or package any of the tests; build server script will define this on the command line with # cmake -G "Visual Studio 10 Win64" -D BUILDSERVER:BOOL=ON ../.. 
if( BUILD_TEST ) if( IS_DIRECTORY "${PROJECT_SOURCE_DIR}/tests" ) @@ -394,7 +394,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clBLASConfigVersion.cmake DESTINATION ${destdir}) -# The following code is setting variables to control the behavior of CPack to generate our +# The following code is setting variables to control the behavior of CPack to generate our if( WIN32 ) set( CPACK_SOURCE_GENERATOR "ZIP" ) set( CPACK_GENERATOR "ZIP" ) From 5c0d759cbb902e787986435d2a2953bb03b0216f Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Fri, 15 Apr 2016 11:39:21 -0500 Subject: [PATCH 29/45] Removed ::cerr wrt calling reference and clblas Each test case was printing Calling reference xxx routine... Calling clblas xxx routine... --- src/tests/correctness/corr-asum.cpp | 9 --------- src/tests/correctness/corr-axpy.cpp | 12 ------------ src/tests/correctness/corr-copy.cpp | 11 ----------- src/tests/correctness/corr-dot.cpp | 9 --------- src/tests/correctness/corr-dotc.cpp | 9 --------- src/tests/correctness/corr-gbmv.cpp | 10 ---------- src/tests/correctness/corr-gemm.cpp | 7 +------ src/tests/correctness/corr-gemm2.cpp | 6 ------ src/tests/correctness/corr-gemv.cpp | 10 +--------- src/tests/correctness/corr-ger.cpp | 10 ---------- src/tests/correctness/corr-gerc.cpp | 10 ---------- src/tests/correctness/corr-hbmv.cpp | 10 ---------- src/tests/correctness/corr-hemm.cpp | 9 --------- src/tests/correctness/corr-hemv.cpp | 10 ---------- src/tests/correctness/corr-her.cpp | 7 ------- src/tests/correctness/corr-her2.cpp | 9 --------- src/tests/correctness/corr-her2k.cpp | 7 ------- src/tests/correctness/corr-herk.cpp | 7 ------- src/tests/correctness/corr-hpmv.cpp | 10 ---------- src/tests/correctness/corr-hpr.cpp | 8 -------- src/tests/correctness/corr-hpr2.cpp | 9 --------- src/tests/correctness/corr-iamax.cpp | 9 --------- src/tests/correctness/corr-nrm2.cpp | 9 --------- src/tests/correctness/corr-rot.cpp | 12 ------------ src/tests/correctness/corr-rotg.cpp | 10 ---------- 
src/tests/correctness/corr-rotm.cpp | 11 ----------- src/tests/correctness/corr-rotmg.cpp | 10 ---------- src/tests/correctness/corr-sbmv.cpp | 10 ---------- src/tests/correctness/corr-scal.cpp | 8 -------- src/tests/correctness/corr-spmv.cpp | 10 ---------- src/tests/correctness/corr-spr.cpp | 11 ----------- src/tests/correctness/corr-spr2.cpp | 9 --------- src/tests/correctness/corr-swap.cpp | 12 ------------ src/tests/correctness/corr-symm.cpp | 8 -------- src/tests/correctness/corr-symv.cpp | 10 +--------- src/tests/correctness/corr-syr.cpp | 12 ------------ src/tests/correctness/corr-syr2.cpp | 10 ---------- src/tests/correctness/corr-syr2k.cpp | 9 +-------- src/tests/correctness/corr-syrk.cpp | 9 +-------- src/tests/correctness/corr-tbmv.cpp | 9 --------- src/tests/correctness/corr-tbsv.cpp | 9 --------- src/tests/correctness/corr-tpmv.cpp | 11 ----------- src/tests/correctness/corr-tpsv.cpp | 9 --------- src/tests/correctness/corr-trmm.cpp | 9 +-------- src/tests/correctness/corr-trmv.cpp | 10 ---------- src/tests/correctness/corr-trsm.cpp | 13 +------------ src/tests/correctness/corr-trsv.cpp | 9 --------- src/tests/include/BlasBase.h | 3 +-- 48 files changed, 8 insertions(+), 442 deletions(-) diff --git a/src/tests/correctness/corr-asum.cpp b/src/tests/correctness/corr-asum.cpp index 81da8e06..90d2b334 100644 --- a/src/tests/correctness/corr-asum.cpp +++ b/src/tests/correctness/corr-asum.cpp @@ -107,20 +107,15 @@ asumCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... 
"; randomVectors(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufAsum = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T1)), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xASUM routine... "; - *blasAsum = ::clMath::blas::asum( params->N, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufAsum == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufAsum, scratchBuff); @@ -136,8 +131,6 @@ asumCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xASUM routine... "; - DataType type; type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -163,8 +156,6 @@ asumCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufAsum, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasAsum), clblasAsum, 0, diff --git a/src/tests/correctness/corr-axpy.cpp b/src/tests/correctness/corr-axpy.cpp index c5816bc7..1aa3246e 100644 --- a/src/tests/correctness/corr-axpy.cpp +++ b/src/tests/correctness/corr-axpy.cpp @@ -111,8 +111,6 @@ axpyCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - // Populate X and Y randomVectors(params->N, (X+params->offBX), params->incx, (Y+params->offCY), params->incy); @@ -120,8 +118,6 @@ axpyCorrectnessTest(TestParams *params) memcpy(blasY, Y, (lengthY + params->offCY) * sizeof(T)); alpha = convertMultiplier(params->alpha); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(T), 0, CL_MEM_READ_WRITE); @@ -142,14 +138,8 @@ axpyCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling reference xAXPY routine... "; - ::clMath::blas::axpy((size_t)params->N, alpha, blasX, (size_t)params->offBX, params->incx, blasY, (size_t)params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; - - - ::std::cerr << "Calling clblas xAXPY routine... "; err = (cl_int)::clMath::clblas::axpy(params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); @@ -169,8 +159,6 @@ axpyCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(T), Y, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-copy.cpp b/src/tests/correctness/corr-copy.cpp index 2ee46c08..625567f1 100644 --- a/src/tests/correctness/corr-copy.cpp +++ b/src/tests/correctness/corr-copy.cpp @@ -104,22 +104,15 @@ copyCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - // Populate A and blasX randomVectors( params->N, (blasX+params->offBX), params->incx, (blasY+params->offCY), params->incy ); memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(blasY, (lengthY + params->offCY)* sizeof(*blasY), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xCOPY routine... "; - ::clMath::blas::copy( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is @@ -137,8 +130,6 @@ copyCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xCOPY routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -162,8 +153,6 @@ copyCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, ((lengthY + params->offCY) * sizeof(*blasY)), clblasY, 0, diff --git a/src/tests/correctness/corr-dot.cpp b/src/tests/correctness/corr-dot.cpp index c4969252..d72b2374 100644 --- a/src/tests/correctness/corr-dot.cpp +++ b/src/tests/correctness/corr-dot.cpp @@ -115,10 +115,8 @@ dotCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... 
"; randomVectors(params->N, (blasX + params->offBX), params->incx, (blasY + params->offCY), params->incy, true); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); @@ -126,10 +124,7 @@ dotCorrectnessTest(TestParams *params) bufDP = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xDOT routine... "; - *blasDP = ::clMath::blas::dot( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL) || (bufDP == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); @@ -144,8 +139,6 @@ dotCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xDOT routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -169,8 +162,6 @@ dotCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufDP, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasDP), clblasDP, 0, diff --git a/src/tests/correctness/corr-dotc.cpp b/src/tests/correctness/corr-dotc.cpp index d4a68b1d..c2bc481b 100644 --- a/src/tests/correctness/corr-dotc.cpp +++ b/src/tests/correctness/corr-dotc.cpp @@ -115,10 +115,8 @@ dotcCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... 
"; randomVectors(params->N, (blasX + params->offBX), params->incx, (blasY + params->offCY), params->incy, true); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); @@ -126,10 +124,7 @@ dotcCorrectnessTest(TestParams *params) bufDP = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xDOTC routine... "; - *blasDP = ::clMath::blas::dotc( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL) || (bufDP == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); @@ -144,8 +139,6 @@ dotcCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xDOTC routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -169,8 +162,6 @@ dotcCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufDP, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasDP), clblasDP, 0, diff --git a/src/tests/correctness/corr-gbmv.cpp b/src/tests/correctness/corr-gbmv.cpp index efa8b4b1..f64fad14 100644 --- a/src/tests/correctness/corr-gbmv.cpp +++ b/src/tests/correctness/corr-gbmv.cpp @@ -112,8 +112,6 @@ gbmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); @@ -130,15 +128,12 @@ gbmvCorrectnessTest(TestParams *params) (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy ); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xGBMV routine... "; - clblasOrder fOrder; clblasTranspose fTrans; fOrder = params->order; @@ -159,7 +154,6 @@ gbmvCorrectnessTest(TestParams *params) } clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is @@ -177,8 +171,6 @@ gbmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xGBMV routine... 
"; - err = (cl_int)clMath::clblas::gbmv(params->order, params->transA, params->M, params->N, params->KL, params->KU, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, @@ -199,8 +191,6 @@ gbmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, diff --git a/src/tests/correctness/corr-gemm.cpp b/src/tests/correctness/corr-gemm.cpp index 5d84983c..61821f4d 100644 --- a/src/tests/correctness/corr-gemm.cpp +++ b/src/tests/correctness/corr-gemm.cpp @@ -77,8 +77,7 @@ gemmCorrectnessTest(TestParams *params) (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -111,7 +110,6 @@ gemmCorrectnessTest(TestParams *params) memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); //::std::cerr << "Done" << ::std::endl; - //::std::cerr << "Calling reference xGEMM routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, A, @@ -139,7 +137,6 @@ gemmCorrectnessTest(TestParams *params) delete[] reorderedB; delete[] reorderedA; } - //::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -167,7 +164,6 @@ gemmCorrectnessTest(TestParams *params) return; } - //::std::cerr << "Calling clblas xGEMM routine... 
"; err = (cl_int)::clMath::clblas::gemm(params->order, params->transA, params->transB, params->M, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta, @@ -188,7 +184,6 @@ gemmCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - //::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-gemm2.cpp b/src/tests/correctness/corr-gemm2.cpp index 2730d425..116816aa 100644 --- a/src/tests/correctness/corr-gemm2.cpp +++ b/src/tests/correctness/corr-gemm2.cpp @@ -120,14 +120,11 @@ gemm2CorrectnessTest(TestParams *params) beta = convertMultiplier(params->beta); } - ::std::cerr << "Generating input data... "; randomGemmMatrices(params->order, params->transA, params->transB, params->M, params->N, params->K, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xGEMM routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, A, @@ -162,7 +159,6 @@ gemm2CorrectnessTest(TestParams *params) delete[] reorderedB; delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -190,7 +186,6 @@ gemm2CorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xGEMM routine... 
"; err = (cl_int)::clMath::clblas::gemm2(params->order, params->transA, params->transB, params->M, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta, @@ -211,7 +206,6 @@ gemm2CorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-gemv.cpp b/src/tests/correctness/corr-gemv.cpp index c1a564c3..f3d5b755 100644 --- a/src/tests/correctness/corr-gemv.cpp +++ b/src/tests/correctness/corr-gemv.cpp @@ -77,8 +77,7 @@ gemvCorrectnessTest(TestParams *params) isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -115,7 +114,6 @@ gemvCorrectnessTest(TestParams *params) lenY = params->N; } - ::std::cerr << "Generating input data... "; setNans(params->rowsA * params->columnsA, A); setNans(params->rowsB * params->columnsB, B); setNans(params->rowsC * params->columnsC, blasC); @@ -134,9 +132,6 @@ gemvCorrectnessTest(TestParams *params) params->columnsC * params->rowsC); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*clblasC)); - ::std::cerr << "Done" << ::std::endl; - - ::std::cerr << "Calling reference xGEMV routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::gemv(clblasColumnMajor, params->transA, params->M, params->N, alpha, A, params->lda, @@ -153,7 +148,6 @@ gemvCorrectnessTest(TestParams *params) delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -178,7 +172,6 @@ gemvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xGEMV routine... "; err = (cl_int)::clMath::clblas::gemv(params->order, params->transA, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->incx, beta, bufC, params->offCY, @@ -199,7 +192,6 @@ gemvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, params->rowsC * params->columnsC * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-ger.cpp b/src/tests/correctness/corr-ger.cpp index 1ffe440a..1c132761 100644 --- a/src/tests/correctness/corr-ger.cpp +++ b/src/tests/correctness/corr-ger.cpp @@ -123,8 +123,6 @@ gerCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); @@ -143,10 +141,6 @@ gerCorrectnessTest(TestParams *params) bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xGER routine... 
"; - - clblasOrder fOrder; size_t fN, fM; size_t fOffx, fOffy; @@ -177,7 +171,6 @@ gerCorrectnessTest(TestParams *params) // Call reference blas routine clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) { /* Skip the test, the most probable reason is @@ -195,8 +188,6 @@ gerCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xGER routine... "; - err = (cl_int)::clMath::clblas::ger( params->order, params->M, params->N, alpha_, bufx, params->offBX, params->incx, bufy, params->offCY, params->incy,bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); @@ -218,7 +209,6 @@ gerCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa)* sizeof(*backA), backA, 0, diff --git a/src/tests/correctness/corr-gerc.cpp b/src/tests/correctness/corr-gerc.cpp index 0070a778..ec5bfaad 100644 --- a/src/tests/correctness/corr-gerc.cpp +++ b/src/tests/correctness/corr-gerc.cpp @@ -123,8 +123,6 @@ gercCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... "; - int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); @@ -143,10 +141,6 @@ gercCorrectnessTest(TestParams *params) bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xGER routine... 
"; - - clblasOrder fOrder; size_t fN, fM; size_t fOffx, fOffy; @@ -180,7 +174,6 @@ gercCorrectnessTest(TestParams *params) else { clMath::blas::gerc(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda); } - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) { /* Skip the test, the most probable reason is @@ -198,8 +191,6 @@ gercCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xGER routine... "; - err = (cl_int)::clMath::clblas::gerc( params->order, params->M, params->N, alpha_, bufx, params->offBX, params->incx, bufy, params->offCY, params->incy,bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); @@ -220,7 +211,6 @@ gercCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa)* sizeof(*backA), backA, 0, diff --git a/src/tests/correctness/corr-hbmv.cpp b/src/tests/correctness/corr-hbmv.cpp index 9b7ff8e1..accecd79 100644 --- a/src/tests/correctness/corr-hbmv.cpp +++ b/src/tests/correctness/corr-hbmv.cpp @@ -107,8 +107,6 @@ hbmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); @@ -125,15 +123,12 @@ hbmvCorrectnessTest(TestParams *params) (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy ); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xHBMV routine... "; - clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; @@ -149,7 +144,6 @@ hbmvCorrectnessTest(TestParams *params) clMath::blas::hbmv(fOrder, fUplo, fN, fK, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is @@ -167,8 +161,6 @@ hbmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHBMV routine... 
"; - err = (cl_int)clMath::clblas::hbmv(params->order, params->uplo, params->N, params->K, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, @@ -189,8 +181,6 @@ hbmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, diff --git a/src/tests/correctness/corr-hemm.cpp b/src/tests/correctness/corr-hemm.cpp index feb5f2ab..8f540a28 100644 --- a/src/tests/correctness/corr-hemm.cpp +++ b/src/tests/correctness/corr-hemm.cpp @@ -133,8 +133,6 @@ hemmCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... " << std::endl; - int creationFlags = 0, AcreationFlags; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); @@ -153,9 +151,6 @@ hemmCorrectnessTest(TestParams *params) bufB = base->createEnqueueBuffer(B, (lengthB + params->offBX) * sizeof(T), 0, CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(backC, (lengthC + params->offCY) * sizeof(T), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xHEMM routine... "; - clblasOrder fOrder; clblasUplo fUplo; clblasSide fSide; @@ -179,7 +174,6 @@ hemmCorrectnessTest(TestParams *params) // Call reference blas routine clMath::blas::hemm(fOrder, fSide, fUplo, fM, fN, alpha_, A, params->offA, params->lda, B, params->offBX, params->ldb, beta_, C, params->offCY, params->ldc); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is @@ -197,8 +191,6 @@ hemmCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHEMM routine... 
"; - err = (cl_int)::clMath::clblas::hemm( params->order, params->side, params->uplo, params->M, params->N, alpha_, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta_, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); @@ -220,7 +212,6 @@ hemmCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, (lengthC + params->offCY) * sizeof(T), backC, 0, diff --git a/src/tests/correctness/corr-hemv.cpp b/src/tests/correctness/corr-hemv.cpp index 41bcb62b..cced473d 100644 --- a/src/tests/correctness/corr-hemv.cpp +++ b/src/tests/correctness/corr-hemv.cpp @@ -114,8 +114,6 @@ hemvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); @@ -134,7 +132,6 @@ hemvCorrectnessTest(TestParams *params) (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; /* printf("\n\n before acml call\nA\n"); printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, A+params->offA); @@ -153,8 +150,6 @@ hemvCorrectnessTest(TestParams *params) //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); - ::std::cerr << "Calling reference xHEMV routine... 
"; - clblasOrder order; clblasUplo fUplo; @@ -169,7 +164,6 @@ hemvCorrectnessTest(TestParams *params) } ::clMath::blas::hemv( order, fUplo, params->N, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; /* printf("\n\n after acml call\n"); printf("\nY\n"); @@ -193,8 +187,6 @@ hemvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHEMV routine... "; - err = (cl_int)::clMath::clblas::hemv(params->order, params->uplo, params->N, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); @@ -214,8 +206,6 @@ hemvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, diff --git a/src/tests/correctness/corr-her.cpp b/src/tests/correctness/corr-her.cpp index 34b57994..73f5be30 100644 --- a/src/tests/correctness/corr-her.cpp +++ b/src/tests/correctness/corr-her.cpp @@ -101,7 +101,6 @@ herCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... "; randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (A + params->offa), params->lda, (X + params->offBX), params->incx ); memcpy(backA, A, (lengthA + params->offa)* sizeof(*A)); ::std::cerr << "Done" << ::std::endl; @@ -110,8 +109,6 @@ herCorrectnessTest(TestParams *params) bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xHER routine... 
"; - clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; @@ -124,7 +121,6 @@ herCorrectnessTest(TestParams *params) fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower; } clMath::blas::her( fOrder, fUplo, params->N, CREAL(alpha_), X , params->offBX, params->incx, A, params->offa, params->lda ); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is @@ -151,8 +147,6 @@ herCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHER routine... "; - err = (cl_int)::clMath::clblas::her( params->order, params->uplo, params->N, CREAL(alpha_), bufX, params->offBX, params->incx, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), @@ -173,7 +167,6 @@ herCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*A), backA, 0, diff --git a/src/tests/correctness/corr-her2.cpp b/src/tests/correctness/corr-her2.cpp index 5d18e1d7..f041420f 100644 --- a/src/tests/correctness/corr-her2.cpp +++ b/src/tests/correctness/corr-her2.cpp @@ -115,8 +115,6 @@ her2CorrectnessTest(TestParams *params) } alpha = convertMultiplier(params->alpha); - ::std::cerr << "Generating input data... "; - randomHer2Matrices(params->order, params->uplo, params->N, &alpha, (blasA + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); @@ -129,8 +127,6 @@ her2CorrectnessTest(TestParams *params) bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xHER2 routine... 
"; - clblasOrder order; clblasUplo fUplo; order = params->order; @@ -147,7 +143,6 @@ her2CorrectnessTest(TestParams *params) else { ::clMath::blas::her2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasA, params->offa, params->lda); } - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is @@ -165,8 +160,6 @@ her2CorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHER2 routine... "; - err = (cl_int)::clMath::clblas::her2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), @@ -187,8 +180,6 @@ her2CorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0, diff --git a/src/tests/correctness/corr-her2k.cpp b/src/tests/correctness/corr-her2k.cpp index d7db83af..21434ec1 100644 --- a/src/tests/correctness/corr-her2k.cpp +++ b/src/tests/correctness/corr-her2k.cpp @@ -106,8 +106,6 @@ her2kCorrectnessTest(TestParams *params) alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); - ::std::cerr << "Generating input data... "; - clblasTranspose ftransB = (params->transA==clblasNoTrans)? 
clblasConjTrans: clblasNoTrans; randomGemmMatrices(params->order, params->transA, ftransB, @@ -115,7 +113,6 @@ her2kCorrectnessTest(TestParams *params) B, params->ldb, true, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); @@ -141,7 +138,6 @@ her2kCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling reference xHER2K routine... "; T fAlpha = alpha; if (params->order == clblasColumnMajor) { ::clMath::blas::her2k(clblasColumnMajor, params->uplo, params->transA, @@ -158,9 +154,7 @@ her2kCorrectnessTest(TestParams *params) A, 0, params->lda, B, 0, params->ldb, CREAL(beta), blasC, 0, params->ldc); } - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling clblas xHER2K routine... "; err = (cl_int)::clMath::clblas::her2k(params->order, params->uplo, params->transA, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, @@ -183,7 +177,6 @@ her2kCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-herk.cpp b/src/tests/correctness/corr-herk.cpp index 2b5d8ab0..f64eb40b 100644 --- a/src/tests/correctness/corr-herk.cpp +++ b/src/tests/correctness/corr-herk.cpp @@ -118,15 +118,11 @@ herkCorrectnessTest(TestParams *params) alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); - ::std::cerr << "Generating input data... 
"; - randomGemmMatrices(params->order, params->transA, clblasNoTrans, params->N, params->N, params->K, useAlpha, &alpha, A, params->lda, NULL, 0, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xHERK routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::herk(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, CREAL(alpha), A, params->lda, @@ -158,7 +154,6 @@ herkCorrectnessTest(TestParams *params) A, params->lda, CREAL(beta), blasC, params->ldc); } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -184,7 +179,6 @@ herkCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHERK routine... "; err = (cl_int)::clMath::clblas::herk(params->order, params->uplo, params->transA, params->N, params->K, CREAL(alpha), bufA, params->offA, params->lda, @@ -207,7 +201,6 @@ herkCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-hpmv.cpp b/src/tests/correctness/corr-hpmv.cpp index 95317741..5da7828b 100644 --- a/src/tests/correctness/corr-hpmv.cpp +++ b/src/tests/correctness/corr-hpmv.cpp @@ -105,8 +105,6 @@ hpmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - if((AP == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(AP, X, blasY, clblasY); @@ -123,15 +121,12 @@ hpmvCorrectnessTest(TestParams *params) (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthA + params->offA)* sizeof(*AP), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xHPMV routine... "; - clblasOrder order; clblasUplo fUplo; @@ -146,7 +141,6 @@ hpmvCorrectnessTest(TestParams *params) } ::clMath::blas::hpmv( order, fUplo, params->N, alpha, AP, params->offA, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is @@ -164,8 +158,6 @@ hpmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHPMV routine... 
"; - err = (cl_int)::clMath::clblas::hpmv(params->order, params->uplo, params->N, alpha, bufAP, params->offA, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); @@ -185,8 +177,6 @@ hpmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, diff --git a/src/tests/correctness/corr-hpr.cpp b/src/tests/correctness/corr-hpr.cpp index 7a513c30..4cb539f4 100644 --- a/src/tests/correctness/corr-hpr.cpp +++ b/src/tests/correctness/corr-hpr.cpp @@ -100,17 +100,13 @@ hprCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... "; randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (AP + params->offa), params->lda, (X + params->offBX), params->incx ); memcpy(backA, AP, (lengthAP + params->offa)* sizeof(T)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xHPR routine... "; - clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; @@ -123,7 +119,6 @@ hprCorrectnessTest(TestParams *params) fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower; } clMath::blas::hpr( fOrder, fUplo, params->N, CREAL(alpha_), X , params->offBX, params->incx, AP, params->offa); - ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is @@ -150,8 +145,6 @@ hprCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHPR routine... 
"; - err = (cl_int)::clMath::clblas::hpr( params->order, params->uplo, params->N, CREAL(alpha_), bufX, params->offBX, params->incx, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), @@ -172,7 +165,6 @@ hprCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(T), backA, 0, diff --git a/src/tests/correctness/corr-hpr2.cpp b/src/tests/correctness/corr-hpr2.cpp index 313f167d..0ba272a1 100644 --- a/src/tests/correctness/corr-hpr2.cpp +++ b/src/tests/correctness/corr-hpr2.cpp @@ -115,21 +115,17 @@ hpr2CorrectnessTest(TestParams *params) } alpha = convertMultiplier(params->alpha); - ::std::cerr << "Generating input data... "; randomHer2Matrices(params->order, params->uplo, params->N, &alpha, (blasAP + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasA to clblasA memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa)* sizeof(*clblasAP), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xHPR2 routine... 
"; - clblasOrder order; clblasUplo fUplo; order = params->order; @@ -146,7 +142,6 @@ hpr2CorrectnessTest(TestParams *params) else { ::clMath::blas::hpr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasAP, params->offa); } - ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is @@ -164,8 +159,6 @@ hpr2CorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xHPR2 routine... "; - err = (cl_int)::clMath::clblas::hpr2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), @@ -186,8 +179,6 @@ hpr2CorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0, diff --git a/src/tests/correctness/corr-iamax.cpp b/src/tests/correctness/corr-iamax.cpp index 81f2bd32..040b918f 100644 --- a/src/tests/correctness/corr-iamax.cpp +++ b/src/tests/correctness/corr-iamax.cpp @@ -106,20 +106,15 @@ iamaxCorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, NULL, 0); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_ONLY); bufiAmax = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(int), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (2 * lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xiAMAX routine... 
"; - *blasiAmax = ::clMath::blas::iamax( params->N, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufiAmax == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufiAmax, scratchBuff); @@ -134,8 +129,6 @@ iamaxCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xiAMAX routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -159,8 +152,6 @@ iamaxCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufiAmax, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasiAmax), clblasiAmax, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-nrm2.cpp b/src/tests/correctness/corr-nrm2.cpp index 588ee825..34bce308 100644 --- a/src/tests/correctness/corr-nrm2.cpp +++ b/src/tests/correctness/corr-nrm2.cpp @@ -109,20 +109,15 @@ nrm2CorrectnessTest(TestParams *params) } srand(params->seed); - ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufNRM2 = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * 2 * sizeof(T1)), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xNRM2 routine... 
"; - *blasNRM2 = ::clMath::blas::nrm2( params->N, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufNRM2 == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufNRM2, scratchBuff); @@ -138,8 +133,6 @@ nrm2CorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xNRM2 routine... "; - DataType type; type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -163,8 +156,6 @@ nrm2CorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufNRM2, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasNRM2), clblasNRM2, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-rot.cpp b/src/tests/correctness/corr-rot.cpp index c9df97ac..3e09c94a 100644 --- a/src/tests/correctness/corr-rot.cpp +++ b/src/tests/correctness/corr-rot.cpp @@ -112,8 +112,6 @@ rotCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - //Filling random values for SA and SB. C & S are only for output sake randomVectors(params->N, (X + params->offa), params->incx, (Y+params->offb), params->incy); @@ -123,17 +121,12 @@ rotCorrectnessTest(TestParams *params) memcpy(back_X, X, (lengthx + params->offa) * sizeof(T)); memcpy(back_Y, Y, (lengthy + params->offb) * sizeof(T)); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthx + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (lengthy + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xROT routine... 
"; - ::clMath::blas::rot(params->N, back_X, params->offa, params->incx, back_Y, params->offb, params->incy, alpha, beta); - ::std::cerr << "Done" << ::std::endl; // Hold X vector @@ -151,9 +144,6 @@ rotCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xROT routine... "; - - err = (cl_int)::clMath::clblas::rot( params->N, bufX, params->offa, params->incx, bufY, params->offb, params->incy, alpha, beta, params->numCommandQueues, base->commandQueues(), 0, NULL, events); @@ -174,8 +164,6 @@ rotCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthx + params->offa) * sizeof(T), X, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-rotg.cpp b/src/tests/correctness/corr-rotg.cpp index 21ef905b..4616533a 100644 --- a/src/tests/correctness/corr-rotg.cpp +++ b/src/tests/correctness/corr-rotg.cpp @@ -135,8 +135,6 @@ rotgCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - //Filling random values for SA and SB. C & S are only for output sake randomVectors(1, (SA+params->offBX), 1, (SB+params->offCY), 1); S[params->offb] = back_S[params->offb] = ZERO(); @@ -144,7 +142,6 @@ rotgCorrectnessTest(TestParams *params) back_SA[params->offBX] = SA[params->offBX]; back_SB[params->offCY] = SB[params->offCY]; - ::std::cerr << "Done" << ::std::endl; //printing the inputs, as they change after processing ::std::cerr << "A = "; @@ -163,10 +160,7 @@ rotgCorrectnessTest(TestParams *params) bufC = base->createEnqueueBuffer(C, (length + params->offa ) * sizeof(T2), 0, CL_MEM_WRITE_ONLY); bufS = base->createEnqueueBuffer(S, (length + params->offb ) * sizeof(T1), 0, CL_MEM_WRITE_ONLY); - ::std::cerr << "Calling reference xROTG routine... 
"; - ::clMath::blas::rotg(back_SA, params->offBX, back_SB, params->offCY, back_C, params->offa, back_S, params->offb); - ::std::cerr << "Done" << ::std::endl; // Hold X vector @@ -185,8 +179,6 @@ rotgCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xROTG routine... "; - DataType type; type = ( typeid(T1) == typeid(cl_float)) ? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double)) ? TYPE_DOUBLE: @@ -214,8 +206,6 @@ rotgCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufSA, CL_TRUE, 0, (length + params->offBX) * sizeof(T1), SA, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-rotm.cpp b/src/tests/correctness/corr-rotm.cpp index 4a1a02e2..33f76485 100644 --- a/src/tests/correctness/corr-rotm.cpp +++ b/src/tests/correctness/corr-rotm.cpp @@ -126,8 +126,6 @@ rotmCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - randomVectors(params->N, (X + params->offa), params->incx, (Y+params->offb), params->incy); randomVectors(4, (PARAM + params->offc + 1), 1); //1st element is initialized separately @@ -138,18 +136,13 @@ rotmCorrectnessTest(TestParams *params) memcpy(back_Y, Y, (lengthy + params->offb)*sizeof(T)); memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T)); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthx + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (lengthy + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); bufParam = base->createEnqueueBuffer(PARAM, (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xROTM routine... 
"; - ::clMath::blas::rotm(params->N, back_X, params->offa, params->incx, back_Y, params->offb, params->incy, back_PARAM, params->offc); - ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL) || (bufParam == NULL)) { @@ -165,8 +158,6 @@ rotmCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xROTM routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float)) ? TYPE_FLOAT : TYPE_DOUBLE; @@ -191,8 +182,6 @@ rotmCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthx + params->offa) * sizeof(T), X, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-rotmg.cpp b/src/tests/correctness/corr-rotmg.cpp index 851310c9..8c448feb 100644 --- a/src/tests/correctness/corr-rotmg.cpp +++ b/src/tests/correctness/corr-rotmg.cpp @@ -134,8 +134,6 @@ rotmgCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - //Filling random values for SA and SB. 
C & S are only for output sake randomRotmg( (D1 + params->offa), (D2 + params->offb), (X + params->offBX), (Y + params->offCY), (PARAM + params->offc) ); @@ -149,8 +147,6 @@ rotmgCorrectnessTest(TestParams *params) memcpy(back_D2, D2, (1 + params->offb)*sizeof(T)); memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T)); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufD1 = base->createEnqueueBuffer(D1, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufD2 = base->createEnqueueBuffer(D2, (1 + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); @@ -158,11 +154,8 @@ rotmgCorrectnessTest(TestParams *params) bufY = base->createEnqueueBuffer(Y, (1 + params->offCY) * sizeof(T), 0, CL_MEM_READ_ONLY); bufParam = base->createEnqueueBuffer(PARAM, (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xROTMG routine... "; - ::clMath::blas::rotmg(back_D1, params->offa, back_D2, params->offb, back_X, params->offBX, back_Y, params->offCY, back_PARAM, params->offc); - ::std::cerr << "Done" << ::std::endl; // Hold X vector @@ -181,8 +174,6 @@ rotmgCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xROTMG routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float)) ? TYPE_FLOAT : TYPE_DOUBLE; @@ -210,7 +201,6 @@ rotmgCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufD1, CL_TRUE, 0, (1 + params->offa) * sizeof(T), D1, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-sbmv.cpp b/src/tests/correctness/corr-sbmv.cpp index b17f6699..fc1eacdb 100644 --- a/src/tests/correctness/corr-sbmv.cpp +++ b/src/tests/correctness/corr-sbmv.cpp @@ -106,8 +106,6 @@ sbmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); @@ -124,15 +122,12 @@ sbmvCorrectnessTest(TestParams *params) (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy ); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xSBMV routine... "; - clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; @@ -148,7 +143,6 @@ sbmvCorrectnessTest(TestParams *params) clMath::blas::sbmv(fOrder, fUplo, fN, fK, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is @@ -166,8 +160,6 @@ sbmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSBMV routine... 
"; - err = (cl_int)clMath::clblas::sbmv(params->order, params->uplo, params->N, params->K, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, @@ -188,8 +180,6 @@ sbmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, diff --git a/src/tests/correctness/corr-scal.cpp b/src/tests/correctness/corr-scal.cpp index ad156f63..ecd8b829 100644 --- a/src/tests/correctness/corr-scal.cpp +++ b/src/tests/correctness/corr-scal.cpp @@ -95,19 +95,14 @@ void scalCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - randomVectors(params->N, (blasX+params->offBX), params->incx); alpha = convertMultiplier(params->alpha); memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); - ::std::cerr << "Done" << ::std::endl; bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xSCAL routine... "; // Both blas and clBlas wrapper functions consider the real part of alpha in case of css/zdscal // This is to make sure both get the same scalar alpha. check wrapper functions ::clMath::blas::scal(is_css_zds, params->N, alpha, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; if (bufX == NULL) { /* Skip the test, the most probable reason is @@ -125,7 +120,6 @@ void scalCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSCAL routine... "; // Both blas and clBlas wrapper functions consider the real part of alpha in case of css/zdscal // This is to make sure both get the same scalar alpha. 
check wrapper functions err = (cl_int)::clMath::clblas::scal(is_css_zds, params->N, alpha, bufX, params->offBX, @@ -145,8 +139,6 @@ void scalCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-spmv.cpp b/src/tests/correctness/corr-spmv.cpp index dcbad3b5..52aded01 100644 --- a/src/tests/correctness/corr-spmv.cpp +++ b/src/tests/correctness/corr-spmv.cpp @@ -105,8 +105,6 @@ spmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - if((AP == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(AP, X, blasY, clblasY); @@ -123,15 +121,12 @@ spmvCorrectnessTest(TestParams *params) (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthA + params->offA)* sizeof(*AP), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xSPMV routine... 
"; - clblasOrder order; clblasUplo fUplo; @@ -145,7 +140,6 @@ spmvCorrectnessTest(TestParams *params) } ::clMath::blas::spmv( order, fUplo, params->N, alpha, AP, params->offA, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is @@ -163,8 +157,6 @@ spmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSPMV routine... "; - err = (cl_int)::clMath::clblas::spmv(params->order, params->uplo, params->N, alpha, bufAP, params->offA, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); @@ -184,8 +176,6 @@ spmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, diff --git a/src/tests/correctness/corr-spr.cpp b/src/tests/correctness/corr-spr.cpp index 4b00a02b..4851e7de 100644 --- a/src/tests/correctness/corr-spr.cpp +++ b/src/tests/correctness/corr-spr.cpp @@ -99,9 +99,6 @@ sprCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - - memset(blasAP, -1, (lengthAP + params->offa)); memset(clblasAP, -1, (lengthAP + params->offa)); memset(X, -1, (lengthX + params->offBX)); @@ -127,13 +124,9 @@ sprCorrectnessTest(TestParams *params) memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP)); - ::std::cerr << "Done" << ::std::endl; - bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa) * sizeof(*clblasAP), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xSPR routine... 
"; - clblasOrder order; clblasUplo fUplo; order = params->order; @@ -152,7 +145,6 @@ sprCorrectnessTest(TestParams *params) } clMath::blas::spr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, blasAP, params->offa); - ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is @@ -170,8 +162,6 @@ sprCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSPR routine... "; - err = (cl_int)::clMath::clblas::spr( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), @@ -192,7 +182,6 @@ sprCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0, diff --git a/src/tests/correctness/corr-spr2.cpp b/src/tests/correctness/corr-spr2.cpp index c000e64c..f852daa1 100644 --- a/src/tests/correctness/corr-spr2.cpp +++ b/src/tests/correctness/corr-spr2.cpp @@ -111,21 +111,17 @@ spr2CorrectnessTest(TestParams *params) alpha = convertMultiplier(params->alpha); useAlpha = true; - ::std::cerr << "Generating input data... 
"; randomSyr2Matrices(params->order, params->uplo, params->N, useAlpha, &alpha, (blasAP + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasAP to clblasAP memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa)* sizeof(*clblasAP), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xSPR2 routine... "; - clblasOrder order; clblasUplo fUplo; @@ -140,7 +136,6 @@ spr2CorrectnessTest(TestParams *params) ::clMath::blas::spr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasAP, params->offa); - ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is @@ -158,8 +153,6 @@ spr2CorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSPR2 routine... 
"; - err = (cl_int)::clMath::clblas::spr2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), @@ -180,8 +173,6 @@ spr2CorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0, diff --git a/src/tests/correctness/corr-swap.cpp b/src/tests/correctness/corr-swap.cpp index e62f88a1..db1eb6d5 100644 --- a/src/tests/correctness/corr-swap.cpp +++ b/src/tests/correctness/corr-swap.cpp @@ -109,16 +109,12 @@ swapCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - // Populate A and blasX randomVectors(params->N, (X+params->offBX), params->incx, (Y+params->offCY), params->incy); memcpy(blasX, X, (lengthX + params->offBX) * sizeof(T)); memcpy(blasY, Y, (lengthY + params->offCY) * sizeof(T)); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(T), 0, CL_MEM_READ_WRITE); @@ -139,14 +135,8 @@ swapCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling reference xSWAP routine... "; - ::clMath::blas::swap( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); - ::std::cerr << "Done" << ::std::endl; - - - ::std::cerr << "Calling clblas xSWAP routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : (( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: (( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE)); @@ -169,8 +159,6 @@ swapCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(T), X, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-symm.cpp b/src/tests/correctness/corr-symm.cpp index cb747689..4af106dd 100644 --- a/src/tests/correctness/corr-symm.cpp +++ b/src/tests/correctness/corr-symm.cpp @@ -127,7 +127,6 @@ symmCorrectnessTest(TestParams *params) return; } srand(params->seed); - ::std::cerr << "Generating input data... "; int creationFlags = 0, AcreationFlags; creationFlags = creationFlags | RANDOM_INIT; @@ -158,9 +157,6 @@ symmCorrectnessTest(TestParams *params) bufB = base->createEnqueueBuffer(B, (lengthB + params->offb) * sizeof(T), 0, CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(backC, (lengthC + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xSYMM routine... "; - clblasOrder fOrder; clblasUplo fUplo; clblasSide fSide; @@ -184,7 +180,6 @@ symmCorrectnessTest(TestParams *params) // Call reference blas routine clMath::blas::symm(fOrder, fSide, fUplo, fM, fN, alpha_, A, params->offa, params->lda, B, params->offb, params->ldb, beta_, C, params->offc, params->ldc); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is @@ -202,8 +197,6 @@ symmCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSYMM routine... 
"; - err = (cl_int)::clMath::clblas::symm( params->order, params->side, params->uplo, params->M, params->N, alpha_, bufA, params->offa, params->lda, bufB, params->offb, params->ldb, beta_, bufC, params->offc, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); @@ -225,7 +218,6 @@ symmCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, (lengthC + params->offc) * sizeof(T), backC, 0, diff --git a/src/tests/correctness/corr-symv.cpp b/src/tests/correctness/corr-symv.cpp index b5fb4201..d4c33c65 100644 --- a/src/tests/correctness/corr-symv.cpp +++ b/src/tests/correctness/corr-symv.cpp @@ -76,8 +76,7 @@ symvCorrectnessTest(TestParams *params) isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -106,7 +105,6 @@ symvCorrectnessTest(TestParams *params) beta = convertMultiplier(params->beta); } - ::std::cerr << "Generating input data... "; setNans(params->rowsA * params->columnsA, A); setNans(params->rowsB * params->columnsB, B); setNans(params->rowsC * params->columnsC, blasC); @@ -125,9 +123,6 @@ symvCorrectnessTest(TestParams *params) setVectorNans(params->offCY, abs(params->incy), blasC, params->N, params->columnsC * params->rowsC); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*clblasC)); - ::std::cerr << "Done" << ::std::endl; - - ::std::cerr << "Calling reference xSYMV routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::symv(clblasColumnMajor, params->uplo, @@ -145,7 +140,6 @@ symvCorrectnessTest(TestParams *params) delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -170,7 +164,6 @@ symvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSYMV routine... "; err = (cl_int)::clMath::clblas::symv(params->order, params->uplo, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->incx, beta, bufC, params->offCY, params->incy, @@ -191,7 +184,6 @@ symvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, params->rowsC * params->columnsC * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-syr.cpp b/src/tests/correctness/corr-syr.cpp index 12967c9e..cae06fcc 100644 --- a/src/tests/correctness/corr-syr.cpp +++ b/src/tests/correctness/corr-syr.cpp @@ -99,9 +99,6 @@ syrCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - - memset(blasA, -1, (lengthA + params->offa)); memset(clblasA, -1, (lengthA + params->offa)); memset(X, -1, (lengthX + params->offBX)); @@ -142,14 +139,10 @@ syrCorrectnessTest(TestParams *params) memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA)); // memcpy(tempA, blasA, (lengthA + params->offa)* sizeof(*blasA)); - ::std::cerr << "Done" << ::std::endl; - // Allocate buffers bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa) * sizeof(*clblasA), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xSYR routine... 
"; - clblasOrder order; clblasUplo fUplo; @@ -183,8 +176,6 @@ syrCorrectnessTest(TestParams *params) //printf("After acml\n"); //printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, blasA); - ::std::cerr << "Done" << ::std::endl; - if ((bufA == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is * matrix too big for a device. @@ -201,8 +192,6 @@ syrCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSYR routine... "; - err = (cl_int)::clMath::clblas::syr( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), @@ -223,7 +212,6 @@ syrCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0, diff --git a/src/tests/correctness/corr-syr2.cpp b/src/tests/correctness/corr-syr2.cpp index 4148ed56..303cce58 100644 --- a/src/tests/correctness/corr-syr2.cpp +++ b/src/tests/correctness/corr-syr2.cpp @@ -111,22 +111,17 @@ syr2CorrectnessTest(TestParams *params) alpha = convertMultiplier(params->alpha); useAlpha = true; - ::std::cerr << "Generating input data... 
"; - randomSyr2Matrices(params->order, params->uplo, params->N, useAlpha, &alpha, (blasA + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasA to clblasA memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa)* sizeof(*clblasA), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); - ::std::cerr << "Calling reference xSYR2 routine... "; - clblasOrder order; clblasUplo fUplo; @@ -141,7 +136,6 @@ syr2CorrectnessTest(TestParams *params) ::clMath::blas::syr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasA, params->offa, params->lda); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is @@ -159,8 +153,6 @@ syr2CorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSYR2 routine... 
"; - err = (cl_int)::clMath::clblas::syr2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), @@ -181,8 +173,6 @@ syr2CorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0, diff --git a/src/tests/correctness/corr-syr2k.cpp b/src/tests/correctness/corr-syr2k.cpp index d42c7f4c..1c05161e 100644 --- a/src/tests/correctness/corr-syr2k.cpp +++ b/src/tests/correctness/corr-syr2k.cpp @@ -88,8 +88,7 @@ syr2kCorrectnessTest(TestParams *params) isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -115,7 +114,6 @@ syr2kCorrectnessTest(TestParams *params) beta = convertMultiplier(params->beta); } - ::std::cerr << "Generating input data... "; if (!useAlpha) { alpha = random(100); if (module(alpha) == 0.0) { @@ -130,9 +128,7 @@ syr2kCorrectnessTest(TestParams *params) params->N, params->N, params->K, true, &a, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xSYR2K routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::syr2k(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, alpha, A, @@ -160,7 +156,6 @@ syr2kCorrectnessTest(TestParams *params) delete[] reorderedB; delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -188,7 +183,6 @@ syr2kCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSYR2K routine... "; err = (cl_int)::clMath::clblas::syr2k(params->order, params->uplo, params->transA, params->N, params->K, alpha, bufA, params->offA, @@ -213,7 +207,6 @@ syr2kCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-syrk.cpp b/src/tests/correctness/corr-syrk.cpp index 8977718d..709ad49a 100644 --- a/src/tests/correctness/corr-syrk.cpp +++ b/src/tests/correctness/corr-syrk.cpp @@ -85,8 +85,7 @@ syrkCorrectnessTest(TestParams *params) isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -111,7 +110,6 @@ syrkCorrectnessTest(TestParams *params) beta = convertMultiplier(params->beta); } - ::std::cerr << "Generating input data... 
"; if (!useAlpha) { alpha = random(100); if (module(alpha) == 0.0) { @@ -123,9 +121,7 @@ syrkCorrectnessTest(TestParams *params) params->N, params->N, params->K, useAlpha, &alpha, A, params->lda, NULL, 0, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xSYRK routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::syrk(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, alpha, A, params->lda, @@ -149,7 +145,6 @@ syrkCorrectnessTest(TestParams *params) delete[] reorderedC; delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -174,7 +169,6 @@ syrkCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xSYRK routine... "; err = (cl_int)::clMath::clblas::syrk(params->order, params->uplo, params->transA, params->N, params->K, alpha, bufA, params->offA, params->lda, @@ -197,7 +191,6 @@ syrkCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), diff --git a/src/tests/correctness/corr-tbmv.cpp b/src/tests/correctness/corr-tbmv.cpp index 7b438ffe..5b37ee95 100644 --- a/src/tests/correctness/corr-tbmv.cpp +++ b/src/tests/correctness/corr-tbmv.cpp @@ -100,8 +100,6 @@ tbmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... 
"; - if((A == NULL) || (blasX == NULL) || (clblasX == NULL)) { deleteBuffers(A, blasX, clblasX); @@ -114,15 +112,12 @@ tbmvCorrectnessTest(TestParams *params) // Copy blasY to clblasY memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufXtemp = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xTBMV routine... "; - clblasOrder fOrder; clblasTranspose fTrans; clblasUplo fUplo; @@ -142,7 +137,6 @@ tbmvCorrectnessTest(TestParams *params) } clMath::blas::tbmv(fOrder, fUplo, fTrans, params->diag, fN, fK, A, params->offA, params->lda, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL)|| (bufXtemp == NULL)) { // Skip the test, the most probable reason is @@ -160,7 +154,6 @@ tbmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTBMV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT:( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; @@ -184,8 +177,6 @@ tbmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, diff --git a/src/tests/correctness/corr-tbsv.cpp b/src/tests/correctness/corr-tbsv.cpp index 17c59f3f..36947c04 100644 --- a/src/tests/correctness/corr-tbsv.cpp +++ b/src/tests/correctness/corr-tbsv.cpp @@ -102,8 +102,6 @@ tbsvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - if((A == NULL) || (blasX == NULL) || (clblasX == NULL)) { deleteBuffers(A, blasX, clblasX, deltaX); @@ -124,14 +122,11 @@ tbsvCorrectnessTest(TestParams *params) (A + params->offA), params->lda, (blasX + params->offBX), params->incx, (deltaX + params->offBX) ); memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); - ::std::cerr << "Calling reference xTBSV routine... "; - clblasOrder fOrder; clblasTranspose fTrans; clblasUplo fUplo; @@ -151,7 +146,6 @@ tbsvCorrectnessTest(TestParams *params) } clMath::blas::tbsv(fOrder, fUplo, fTrans, params->diag, fN, fK, A, params->offA, params->lda, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL)) { // Skip the test, the most probable reason is @@ -169,7 +163,6 @@ tbsvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTBSV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT:( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; @@ -193,8 +186,6 @@ tbsvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, diff --git a/src/tests/correctness/corr-tpmv.cpp b/src/tests/correctness/corr-tpmv.cpp index 041154e9..b2ac494e 100644 --- a/src/tests/correctness/corr-tpmv.cpp +++ b/src/tests/correctness/corr-tpmv.cpp @@ -108,8 +108,6 @@ tpmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - // Set data in A and X using populate() routine int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT | PACKED_MATRIX; @@ -125,7 +123,6 @@ tpmvCorrectnessTest(TestParams *params) // Copy blasX to clblasX memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa)* sizeof(*AP), 0, CL_MEM_READ_ONLY); @@ -135,9 +132,6 @@ tpmvCorrectnessTest(TestParams *params) //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); - ::std::cerr << "Calling reference xTPMV routine... "; - - clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; @@ -157,7 +151,6 @@ tpmvCorrectnessTest(TestParams *params) } ::clMath::blas::tpmv( order, fUplo, fTrans, params->diag, params->N, AP, params->offa, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; // Hold X vector @@ -177,8 +170,6 @@ tpmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTPMV routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -202,8 +193,6 @@ tpmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, diff --git a/src/tests/correctness/corr-tpsv.cpp b/src/tests/correctness/corr-tpsv.cpp index 931d8214..b32d3612 100644 --- a/src/tests/correctness/corr-tpsv.cpp +++ b/src/tests/correctness/corr-tpsv.cpp @@ -109,8 +109,6 @@ tpsvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - //custom generation function in blas-random.h randomTrsvMatrices( params->order, params->uplo, params->diag, params->N, (A + params->offa), 0, (blasX + params->offBX), params->incx); @@ -127,9 +125,6 @@ tpsvCorrectnessTest(TestParams *params) // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(T), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(backX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_WRITE_ONLY); - ::std::cerr << "Done" << ::std::endl; - - ::std::cerr << "Calling reference xTPSV routine... "; clblasOrder order; clblasUplo fUplo; @@ -150,7 +145,6 @@ tpsvCorrectnessTest(TestParams *params) doConjugate((A + params->offa), 1, lengthA, 1); } ::clMath::blas::tpsv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; /* printf("\n\n acml result X\n"); @@ -173,8 +167,6 @@ tpsvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTPSV routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; @@ -198,7 +190,6 @@ tpsvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, lengthX * sizeof(*backX), backX, 0, diff --git a/src/tests/correctness/corr-trmm.cpp b/src/tests/correctness/corr-trmm.cpp index c92e0e34..bb6dca61 100644 --- a/src/tests/correctness/corr-trmm.cpp +++ b/src/tests/correctness/corr-trmm.cpp @@ -73,8 +73,7 @@ trmmCorrectnessTest(TestParams *params) isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -94,14 +93,11 @@ trmmCorrectnessTest(TestParams *params) alpha = convertMultiplier(params->alpha); } - ::std::cerr << "Generating input data... "; randomTrmmMatrices(params->order, params->side, params->uplo, params->diag, params->M, params->N, useAlpha, &alpha, A, params->lda, blasB, params->ldb); memcpy(clblasB, blasB, params->rowsB * params->columnsB * sizeof(*blasB)); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xTRMM routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::trmm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, @@ -124,7 +120,6 @@ trmmCorrectnessTest(TestParams *params) delete[] reorderedB; delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -149,7 +144,6 @@ trmmCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTRMM routine... 
"; err = (cl_int)::clMath::clblas::trmm(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, @@ -170,7 +164,6 @@ trmmCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, params->offBX * sizeof(*clblasB), diff --git a/src/tests/correctness/corr-trmv.cpp b/src/tests/correctness/corr-trmv.cpp index 95089fc6..7eb14daf 100644 --- a/src/tests/correctness/corr-trmv.cpp +++ b/src/tests/correctness/corr-trmv.cpp @@ -108,8 +108,6 @@ trmvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - // Set data in A and X using populate() routine int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; @@ -125,7 +123,6 @@ trmvCorrectnessTest(TestParams *params) // Copy blasX to clblasX memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(*A), 0, CL_MEM_READ_ONLY); @@ -135,8 +132,6 @@ trmvCorrectnessTest(TestParams *params) //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); - ::std::cerr << "Calling reference xTRMV routine... "; - clblasOrder order; clblasUplo fUplo; @@ -157,7 +152,6 @@ trmvCorrectnessTest(TestParams *params) } ::clMath::blas::trmv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, params->lda, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; // Hold X vector @@ -177,8 +171,6 @@ trmvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTRMV routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? 
TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; @@ -202,8 +194,6 @@ trmvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; - err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, diff --git a/src/tests/correctness/corr-trsm.cpp b/src/tests/correctness/corr-trsm.cpp index e53331ca..a8a9253a 100644 --- a/src/tests/correctness/corr-trsm.cpp +++ b/src/tests/correctness/corr-trsm.cpp @@ -77,8 +77,7 @@ trsmCorrectnessTest(TestParams *params) isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { - std::cerr << ">> Test is skipped because it has no importance for this " - "level of coverage" << std::endl; + std::cerr << ">> Test is skipped" << std::endl; SUCCEED(); return; } @@ -100,17 +99,13 @@ trsmCorrectnessTest(TestParams *params) alpha = convertMultiplier(params->alpha); } - ::std::cerr << "Generating input data... "; - randomTrsmMatrices(params->order, params->side, params->uplo, params->diag, params->M, params->N, useAlpha, &alpha, A, params->lda, B, params->ldb); memcpy(blasB, B, params->rowsB * params->columnsB * sizeof(*B)); memcpy(clblasB, B, params->rowsB * params->columnsB * sizeof(*B)); - ::std::cerr << "Done" << ::std::endl; - ::std::cerr << "Calling reference xTRSM routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, A, @@ -135,7 +130,6 @@ trsmCorrectnessTest(TestParams *params) delete[] reorderedB; delete[] reorderedA; } - ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), @@ -160,7 +154,6 @@ trsmCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTRSM routine... "; err = (cl_int)::clMath::clblas::trsm(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, @@ -181,7 +174,6 @@ trsmCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, params->offBX * sizeof(*clblasB), @@ -364,7 +356,6 @@ void Extratest(size_t M, size_t N, size_t lda, size_t ldb, T alpha, T delta) memcpy(blasB, B, N*ldb*sizeof(T)); memcpy(clblasB, B, N*ldb*sizeof(T)); - ::std::cerr << "Calling reference xTRSM routine... "; ::clMath::blas::trsm(order, side, uplo, trans, diag, M, N, alpha, A, lda, blasB, ldb); @@ -390,7 +381,6 @@ void Extratest(size_t M, size_t N, size_t lda, size_t ldb, T alpha, T delta) return; } - ::std::cerr << "Calling clblas xTRSM routine... 
"; err = (cl_int)::clMath::clblas::trsm(order, side, uplo, trans, diag, M, N, alpha, bufA, 0, lda, bufB, 0, ldb, 1, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { @@ -407,7 +397,6 @@ void Extratest(size_t M, size_t N, size_t lda, size_t ldb, T alpha, T delta) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, 0, N*ldb*sizeof(T), clblasB, 0, NULL, NULL); diff --git a/src/tests/correctness/corr-trsv.cpp b/src/tests/correctness/corr-trsv.cpp index b0ed0caf..f3def004 100644 --- a/src/tests/correctness/corr-trsv.cpp +++ b/src/tests/correctness/corr-trsv.cpp @@ -109,8 +109,6 @@ trsvCorrectnessTest(TestParams *params) srand(params->seed); - ::std::cerr << "Generating input data... "; - //custom generation function in blas-random.h randomTrsvMatrices( params->order, params->uplo, params->diag, params->N, (A + params->offa), params->lda, (blasX + params->offBX), params->incx); @@ -127,9 +125,6 @@ trsvCorrectnessTest(TestParams *params) // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(T), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(backX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_WRITE_ONLY); - ::std::cerr << "Done" << ::std::endl; - - ::std::cerr << "Calling reference xTRSV routine... "; clblasOrder order; clblasUplo fUplo; @@ -150,7 +145,6 @@ trsvCorrectnessTest(TestParams *params) doConjugate((A + params->offa), params->N, params->N, params->lda ); } ::clMath::blas::trsv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, params->lda, blasX, params->offBX, params->incx); - ::std::cerr << "Done" << ::std::endl; /* printf("\n\n acml result X\n"); @@ -173,8 +167,6 @@ trsvCorrectnessTest(TestParams *params) return; } - ::std::cerr << "Calling clblas xTRSV routine... "; - DataType type; type = ( typeid(T) == typeid(cl_float))? 
TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; @@ -198,7 +190,6 @@ trsvCorrectnessTest(TestParams *params) delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } - ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, lengthX * sizeof(*backX), backX, 0, diff --git a/src/tests/include/BlasBase.h b/src/tests/include/BlasBase.h index 6c13e520..bcde47f2 100644 --- a/src/tests/include/BlasBase.h +++ b/src/tests/include/BlasBase.h @@ -33,8 +33,7 @@ do { \ \ if (err == CL_INVALID_DEVICE && !base->isDevSupportDoublePrecision()) { \ ::std::cerr << std::endl << ">> " << funcName << \ - "() reported that this device doesn't support double " \ - "precision floating point arithmetic. Test is skipped" << \ + "() no double; test is skipped" << \ ::std::endl; \ SUCCEED(); \ \ From ac1854d76c26d24a572c320db209ba633b9ce867 Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Fri, 15 Apr 2016 18:01:55 -0500 Subject: [PATCH 30/45] Removing the printing of unit test parameters Changing unit tests to only print test parameters on unit test failures. 
--- src/tests/correctness/corr-asum.cpp | 8 ++++++++ src/tests/correctness/corr-axpy.cpp | 7 +++++++ src/tests/correctness/corr-copy.cpp | 7 +++++++ src/tests/correctness/corr-dot.cpp | 8 ++++++++ src/tests/correctness/corr-dotc.cpp | 8 ++++++++ src/tests/correctness/corr-gbmv.cpp | 9 +++++++++ src/tests/correctness/corr-gemm.cpp | 12 ++++++++++-- src/tests/correctness/corr-gemm2.cpp | 10 ++++++++++ src/tests/correctness/corr-gemv.cpp | 8 ++++++++ src/tests/correctness/corr-ger.cpp | 11 +++++++++++ src/tests/correctness/corr-gerc.cpp | 11 +++++++++++ src/tests/correctness/corr-hbmv.cpp | 9 +++++++++ src/tests/correctness/corr-hemm.cpp | 9 +++++++++ src/tests/correctness/corr-hemv.cpp | 9 +++++++++ src/tests/correctness/corr-her.cpp | 11 +++++++++-- src/tests/correctness/corr-her2.cpp | 9 ++++++++- src/tests/correctness/corr-her2k.cpp | 8 ++++++++ src/tests/correctness/corr-herk.cpp | 8 ++++++++ src/tests/correctness/corr-hpr.cpp | 2 -- src/tests/correctness/corr-iamax.cpp | 7 +++++++ src/tests/correctness/corr-nrm2.cpp | 7 +++++++ src/tests/correctness/corr-rot.cpp | 6 ++++++ src/tests/correctness/corr-rotg.cpp | 24 +++++++++++++++--------- src/tests/correctness/corr-rotm.cpp | 6 ++++++ src/tests/correctness/corr-rotmg.cpp | 6 ++++++ src/tests/correctness/corr-sbmv.cpp | 9 +++++++++ src/tests/correctness/corr-scal.cpp | 8 ++++++++ src/tests/correctness/corr-spmv.cpp | 9 +++++++++ src/tests/correctness/corr-spr.cpp | 1 - src/tests/correctness/corr-swap.cpp | 7 +++++++ src/tests/correctness/corr-symm.cpp | 9 +++++++++ src/tests/correctness/corr-symv.cpp | 9 +++++++++ src/tests/correctness/corr-syr.cpp | 9 ++++++++- src/tests/correctness/corr-syr2.cpp | 8 ++++++++ src/tests/correctness/corr-syr2k.cpp | 8 ++++++++ src/tests/correctness/corr-syrk.cpp | 8 ++++++++ src/tests/correctness/corr-tbmv.cpp | 9 +++++++++ src/tests/correctness/corr-tbsv.cpp | 9 +++++++++ src/tests/correctness/corr-trmm.cpp | 9 +++++++++ src/tests/correctness/corr-trmv.cpp | 8 ++++++++ 
src/tests/correctness/corr-trsm.cpp | 9 +++++++++ src/tests/correctness/corr-trsv.cpp | 8 ++++++++ src/tests/include/asum.h | 4 ---- src/tests/include/axpy.h | 3 --- src/tests/include/copy.h | 3 --- src/tests/include/dot.h | 4 ---- src/tests/include/dotc.h | 4 ---- src/tests/include/gbmv.h | 5 ----- src/tests/include/gemm-2.h | 6 ------ src/tests/include/gemm.h | 5 ----- src/tests/include/gemv.h | 5 ----- src/tests/include/ger.h | 8 -------- src/tests/include/gerc.h | 6 ------ src/tests/include/hbmv.h | 5 ----- src/tests/include/hemm.h | 5 ----- src/tests/include/hemv.h | 5 ----- src/tests/include/her.h | 6 ------ src/tests/include/her2.h | 5 ----- src/tests/include/her2k.h | 5 ----- src/tests/include/herk.h | 5 ----- src/tests/include/iamax.h | 3 --- src/tests/include/nrm2.h | 4 ---- src/tests/include/rot.h | 3 --- src/tests/include/rotg.h | 3 --- src/tests/include/rotm.h | 3 --- src/tests/include/rotmg.h | 3 --- src/tests/include/sbmv.h | 5 ----- src/tests/include/scal.h | 4 ---- src/tests/include/spmv.h | 5 ----- src/tests/include/swap.h | 3 --- src/tests/include/symm.h | 5 ----- src/tests/include/symv.h | 5 ----- src/tests/include/syr.h | 5 ----- src/tests/include/syr2.h | 5 ----- src/tests/include/syr2k.h | 5 ----- src/tests/include/syrk.h | 5 ----- src/tests/include/tbmv.h | 5 ----- src/tests/include/tbsv.h | 5 ----- src/tests/include/trmm.h | 5 ----- src/tests/include/trmv.h | 4 ---- src/tests/include/trsm.h | 5 ----- src/tests/include/trsv.h | 4 ---- 82 files changed, 339 insertions(+), 201 deletions(-) diff --git a/src/tests/correctness/corr-asum.cpp b/src/tests/correctness/corr-asum.cpp index 90d2b334..a05a34e6 100644 --- a/src/tests/correctness/corr-asum.cpp +++ b/src/tests/correctness/corr-asum.cpp @@ -167,6 +167,14 @@ asumCorrectnessTest(TestParams *params) releaseMemObjects(bufX, bufAsum, scratchBuff); compareMatrices(clblasColumnMajor, 1 , 1, (blasAsum), (clblasAsum+params->offa), 1); + + if (::testing::Test::HasFailure()) + { + 
printTestParams(params->N, params->offBX, params->incx); + ::std::cerr << "offAsum = " << params->offa << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasX); deleteBuffers(blasAsum, clblasAsum); delete[] events; diff --git a/src/tests/correctness/corr-axpy.cpp b/src/tests/correctness/corr-axpy.cpp index 1aa3246e..b23e6e08 100644 --- a/src/tests/correctness/corr-axpy.cpp +++ b/src/tests/correctness/corr-axpy.cpp @@ -170,6 +170,13 @@ axpyCorrectnessTest(TestParams *params) releaseMemObjects(bufX, bufY); compareMatrices(clblasRowMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), 1); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->alpha, params->offBX, params->incx, params->offCY, params->incy); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(X, Y, blasX, blasY); delete[] events; } diff --git a/src/tests/correctness/corr-copy.cpp b/src/tests/correctness/corr-copy.cpp index 625567f1..468a20e9 100644 --- a/src/tests/correctness/corr-copy.cpp +++ b/src/tests/correctness/corr-copy.cpp @@ -165,6 +165,13 @@ copyCorrectnessTest(TestParams *params) releaseMemObjects(bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY, NULL); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offBX, params->incx, params->offCY, params->incy); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasX, blasY, clblasY); delete[] events; } diff --git a/src/tests/correctness/corr-dot.cpp b/src/tests/correctness/corr-dot.cpp index d72b2374..02fd84ec 100644 --- a/src/tests/correctness/corr-dot.cpp +++ b/src/tests/correctness/corr-dot.cpp @@ -173,6 +173,14 @@ dotCorrectnessTest(TestParams *params) releaseMemObjects(bufX, bufY, bufDP, scratchBuff); compareMatrices(clblasColumnMajor, 1 , 1, (blasDP), 
(clblasDP+params->offa), 1); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offBX, params->incx, params->offCY, params->incy); + ::std::cerr << "offDP = " << params->offa << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; } diff --git a/src/tests/correctness/corr-dotc.cpp b/src/tests/correctness/corr-dotc.cpp index c2bc481b..8db5550b 100644 --- a/src/tests/correctness/corr-dotc.cpp +++ b/src/tests/correctness/corr-dotc.cpp @@ -173,6 +173,14 @@ dotcCorrectnessTest(TestParams *params) releaseMemObjects(bufX, bufY, bufDP, scratchBuff); compareMatrices(clblasColumnMajor, 1 , 1, (blasDP), (clblasDP+params->offa), 1); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offBX, params->incx, params->offCY, params->incy); + ::std::cerr << "offDP = " << params->offa << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; } diff --git a/src/tests/correctness/corr-gbmv.cpp b/src/tests/correctness/corr-gbmv.cpp index f64fad14..5bb80c37 100644 --- a/src/tests/correctness/corr-gbmv.cpp +++ b/src/tests/correctness/corr-gbmv.cpp @@ -203,6 +203,15 @@ gbmvCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->transA, params->M, params->N, params->KL, params->KU, params->alpha, params->offA, + params->lda, params->offBX, params->incx, params->beta, params->offCY, params->incy); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, X, blasY, clblasY); delete[] events; } diff --git 
a/src/tests/correctness/corr-gemm.cpp b/src/tests/correctness/corr-gemm.cpp index 61821f4d..6280a473 100644 --- a/src/tests/correctness/corr-gemm.cpp +++ b/src/tests/correctness/corr-gemm.cpp @@ -103,12 +103,10 @@ gemmCorrectnessTest(TestParams *params) beta = convertMultiplier(params->beta); } - //::std::cerr << "Generating input data... "; randomGemmMatrices(params->order, params->transA, params->transB, params->M, params->N, params->K, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); - //::std::cerr << "Done" << ::std::endl; if (params->order == clblasColumnMajor) { ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, @@ -193,6 +191,16 @@ gemmCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->M, params->N, blasC, clblasC, params->ldc); + + if (::testing::Test::HasFailure( ) ) + { + printTestParams(params->order, params->transA, params->transB, params->M, params->N, params->K, base->useAlpha(), + base->alpha(), params->offA, params->lda, params->offBX, params->ldb, base->useBeta(), + base->beta(), params->offCY, params->ldc); + ::std::cerr << " seed = " << params->seed << ", " + << "queues = " << params->numCommandQueues << ", "; + } + deleteBuffers(A, B, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-gemm2.cpp b/src/tests/correctness/corr-gemm2.cpp index 116816aa..1bb790df 100644 --- a/src/tests/correctness/corr-gemm2.cpp +++ b/src/tests/correctness/corr-gemm2.cpp @@ -215,6 +215,16 @@ gemm2CorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->M, params->N, blasC, clblasC, params->ldc); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->transA, params->transB, params->M, params->N, params->K, base->useAlpha(), + base->alpha(), params->offA, params->lda, 
params->offBX, params->ldb, base->useBeta(), + base->beta(), params->offCY, params->ldc); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-gemv.cpp b/src/tests/correctness/corr-gemv.cpp index f3d5b755..279491e1 100644 --- a/src/tests/correctness/corr-gemv.cpp +++ b/src/tests/correctness/corr-gemv.cpp @@ -202,6 +202,14 @@ gemvCorrectnessTest(TestParams *params) compareVectors(params->offCY, lenY, abs(params->incy), params->columnsC * params->rowsC, blasC, clblasC); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->transA, params->M, params->N, base->useAlpha(), base->alpha(), params->offA, + params->lda, params->incx, base->useBeta(), base->beta(), params->incy); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-ger.cpp b/src/tests/correctness/corr-ger.cpp index 1c132761..7c3a6709 100644 --- a/src/tests/correctness/corr-ger.cpp +++ b/src/tests/correctness/corr-ger.cpp @@ -218,6 +218,17 @@ gerCorrectnessTest(TestParams *params) // handle lda correctly based on row-major/col-major.. 
compareMatrices(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->M, params->N, useAlpha, + base->alpha(), + params->lda, params->incx, params->incy, params->offa, params->offBX, params->offCY); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, x, y, backA); delete[] events; } diff --git a/src/tests/correctness/corr-gerc.cpp b/src/tests/correctness/corr-gerc.cpp index ec5bfaad..81d8833e 100644 --- a/src/tests/correctness/corr-gerc.cpp +++ b/src/tests/correctness/corr-gerc.cpp @@ -220,6 +220,17 @@ gercCorrectnessTest(TestParams *params) // handle lda correctly based on row-major/col-major.. compareMatrices(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->M, params->N, useAlpha, + base->alpha(), + params->lda, params->incx, params->incy, params->offa, params->offBX, params->offCY); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, x, y, backA); delete[] events; } diff --git a/src/tests/correctness/corr-hbmv.cpp b/src/tests/correctness/corr-hbmv.cpp index accecd79..846dc8d9 100644 --- a/src/tests/correctness/corr-hbmv.cpp +++ b/src/tests/correctness/corr-hbmv.cpp @@ -193,6 +193,15 @@ hbmvCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, params->K, params->alpha, params->offA, + params->lda, params->offBX, params->incx, params->beta, params->offCY, params->incy); + ::std::cerr << 
"seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, X, blasY, clblasY); delete[] events; } diff --git a/src/tests/correctness/corr-hemm.cpp b/src/tests/correctness/corr-hemm.cpp index 8f540a28..69536d8b 100644 --- a/src/tests/correctness/corr-hemm.cpp +++ b/src/tests/correctness/corr-hemm.cpp @@ -226,6 +226,15 @@ hemmCorrectnessTest(TestParams *params) // handle lda correctly based on row-major/col-major.. compareMatrices(params->order, params->M , params->N, (C + params->offCY), (backC + params->offCY), params->ldc); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->side, params->uplo, params->M, params->N, 1, params->alpha, 1, params->beta, params->lda, params->ldb, params->ldc, params->offA, params->offb, params->offc); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, C, backC); delete[] events; } diff --git a/src/tests/correctness/corr-hemv.cpp b/src/tests/correctness/corr-hemv.cpp index cced473d..2f2517d6 100644 --- a/src/tests/correctness/corr-hemv.cpp +++ b/src/tests/correctness/corr-hemv.cpp @@ -225,6 +225,15 @@ hemvCorrectnessTest(TestParams *params) */ compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, params->alpha, params->offA, + params->lda, params->offBX, params->incx, params->beta, params->offCY, params->incy); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, X, blasY, clblasY); delete[] events; } diff --git a/src/tests/correctness/corr-her.cpp b/src/tests/correctness/corr-her.cpp index 73f5be30..57b95f21 100644 --- a/src/tests/correctness/corr-her.cpp +++ 
b/src/tests/correctness/corr-her.cpp @@ -103,7 +103,6 @@ herCorrectnessTest(TestParams *params) randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (A + params->offa), params->lda, (X + params->offBX), params->incx ); memcpy(backA, A, (lengthA + params->offa)* sizeof(*A)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE); @@ -178,10 +177,18 @@ herCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufX); - printf("Comparing the results\n"); compareMatrices(params->order, params->N , params->N, (A + params->offa), (backA + params->offa), params->lda); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, params->alpha.re, + params->offBX, params->incx, params->offa, params->lda); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers( A, backA, X); delete[] events; } diff --git a/src/tests/correctness/corr-her2.cpp b/src/tests/correctness/corr-her2.cpp index f041420f..e72db799 100644 --- a/src/tests/correctness/corr-her2.cpp +++ b/src/tests/correctness/corr-her2.cpp @@ -120,7 +120,6 @@ her2CorrectnessTest(TestParams *params) // Copy blasA to clblasA memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA)); - ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa)* sizeof(*clblasA), 0,CL_MEM_READ_WRITE); @@ -194,6 +193,14 @@ her2CorrectnessTest(TestParams *params) compareMatrices(params->order, params->N , params->N, (blasA + params->offa), (clblasA + params->offa), params->lda); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, 1, params->alpha, params->offBX, params->incx, params->offCY, params->incy, params->offa, params->lda); + + ::std::cerr << "seed = " << 
params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasA, clblasA, X, Y); delete[] events; } diff --git a/src/tests/correctness/corr-her2k.cpp b/src/tests/correctness/corr-her2k.cpp index 21434ec1..06675c85 100644 --- a/src/tests/correctness/corr-her2k.cpp +++ b/src/tests/correctness/corr-her2k.cpp @@ -184,6 +184,14 @@ her2kCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->N, params->K, true, params->alpha, + params->offa, params->lda, params->offBX, params->ldb, true, params->beta, params->offCY, params->ldc); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-herk.cpp b/src/tests/correctness/corr-herk.cpp index f64eb40b..8158984c 100644 --- a/src/tests/correctness/corr-herk.cpp +++ b/src/tests/correctness/corr-herk.cpp @@ -211,6 +211,14 @@ herkCorrectnessTest(TestParams *params) compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->N, params->K, true, params->alpha, + params->offA, params->lda, true, params->beta, params->offCY, params->ldc); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-hpr.cpp b/src/tests/correctness/corr-hpr.cpp index 4cb539f4..d7d7e380 100644 --- a/src/tests/correctness/corr-hpr.cpp +++ b/src/tests/correctness/corr-hpr.cpp @@ -176,8 +176,6 @@ 
hprCorrectnessTest(TestParams *params) releaseMemObjects(bufAP, bufX); - printf("Comparing the results\n"); - compareMatrices(clblasColumnMajor, lengthAP, 1, (AP + params->offa), (backA + params->offa), lengthAP); deleteBuffers( AP, backA, X); diff --git a/src/tests/correctness/corr-iamax.cpp b/src/tests/correctness/corr-iamax.cpp index 040b918f..1fbe6f84 100644 --- a/src/tests/correctness/corr-iamax.cpp +++ b/src/tests/correctness/corr-iamax.cpp @@ -161,6 +161,13 @@ iamaxCorrectnessTest(TestParams *params) } compareValues((blasiAmax), (clblasiAmax+params->offa), 0); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offBX, params->incx); + ::std::cerr << "offiAmax = " << params->offa << ::std::endl; + } + releaseMemObjects(bufX, bufiAmax, scratchBuff); deleteBuffers(blasX, blasiAmax, clblasiAmax); delete[] events; diff --git a/src/tests/correctness/corr-nrm2.cpp b/src/tests/correctness/corr-nrm2.cpp index 34bce308..9fad6122 100644 --- a/src/tests/correctness/corr-nrm2.cpp +++ b/src/tests/correctness/corr-nrm2.cpp @@ -173,6 +173,13 @@ nrm2CorrectnessTest(TestParams *params) } compareValues( (blasNRM2), (clblasNRM2+params->offa), delta); + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offBX, params->incx); + ::std::cerr << "offNRM2 = " << params->offa << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasX); deleteBuffers(blasNRM2, clblasNRM2); delete[] events; diff --git a/src/tests/correctness/corr-rot.cpp b/src/tests/correctness/corr-rot.cpp index 3e09c94a..fb208eb8 100644 --- a/src/tests/correctness/corr-rot.cpp +++ b/src/tests/correctness/corr-rot.cpp @@ -182,6 +182,12 @@ rotCorrectnessTest(TestParams *params) compareMatrices(clblasRowMajor, lengthx , 1, (back_X + params->offa), (X + params->offa), 1); compareMatrices(clblasRowMajor, lengthy , 1, (back_Y + params->offb), (Y + params->offb), 1); + if (::testing::Test::HasFailure()) + { + 
printTestParams(params->N, params->offa, params->incx, params->offb, params->incy, params->alpha, params->beta); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(X, Y, back_X, back_Y); delete[] events; } diff --git a/src/tests/correctness/corr-rotg.cpp b/src/tests/correctness/corr-rotg.cpp index 4616533a..05be85cc 100644 --- a/src/tests/correctness/corr-rotg.cpp +++ b/src/tests/correctness/corr-rotg.cpp @@ -144,15 +144,15 @@ rotgCorrectnessTest(TestParams *params) back_SB[params->offCY] = SB[params->offCY]; //printing the inputs, as they change after processing - ::std::cerr << "A = "; - printElement(SA[params->offBX]); - ::std::cerr << "\tB = "; - printElement(SB[params->offCY]); - ::std::cerr << "\tC = "; - printElement(C[params->offa]); - ::std::cerr << "\tS = "; - printElement(S[params->offb]); - ::std::cout << std::endl << std::endl; + //::std::cerr << "A = "; + //printElement(SA[params->offBX]); + //::std::cerr << "\tB = "; + //printElement(SB[params->offCY]); + //::std::cerr << "\tC = "; + //printElement(C[params->offa]); + //::std::cerr << "\tS = "; + //printElement(S[params->offb]); + //::std::cout << std::endl << std::endl; // Allocate buffers bufSA = base->createEnqueueBuffer(SA, (length + params->offBX) * sizeof(T1), 0, CL_MEM_READ_WRITE); @@ -241,6 +241,12 @@ rotgCorrectnessTest(TestParams *params) delta = deltaForType * returnMax(back_S[params->offb]); compareValues( (back_S + params->offb), (S + params->offb), delta); + if (::testing::Test::HasFailure()) + { + printTestParams(params->offBX, params->offCY, params->offa, params->offb); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(SA, SB, S, back_SA, back_SB, back_S); deleteBuffers(C, back_C); delete[] events; diff --git a/src/tests/correctness/corr-rotm.cpp b/src/tests/correctness/corr-rotm.cpp index 33f76485..c791108e 100644 --- a/src/tests/correctness/corr-rotm.cpp +++ b/src/tests/correctness/corr-rotm.cpp @@ 
-199,6 +199,12 @@ rotmCorrectnessTest(TestParams *params) compareMatrices(clblasColumnMajor, lengthx , 1, (back_X + params->offa), (X + params->offa), lengthx); compareMatrices(clblasColumnMajor, lengthy , 1, (back_Y + params->offb), (Y + params->offb), lengthy); + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offa, params->incx, params->offb, params->incy, params->offc, params->alpha); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM); delete[] events; } diff --git a/src/tests/correctness/corr-rotmg.cpp b/src/tests/correctness/corr-rotmg.cpp index 8c448feb..1ff65551 100644 --- a/src/tests/correctness/corr-rotmg.cpp +++ b/src/tests/correctness/corr-rotmg.cpp @@ -249,6 +249,12 @@ rotmgCorrectnessTest(TestParams *params) } compareMatrices(clblasColumnMajor, 5 , 1, (back_PARAM + params->offc), (PARAM + params->offc), 5, deltaArr); + if (::testing::Test::HasFailure()) + { + printTestParams(params->offBX, params->offCY, params->offa, params->offb, params->offc, params->alpha); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(D1, D2, X, Y, PARAM); deleteBuffers(back_D1, back_D2, back_X, back_Y, back_PARAM); diff --git a/src/tests/correctness/corr-sbmv.cpp b/src/tests/correctness/corr-sbmv.cpp index fc1eacdb..bf974c59 100644 --- a/src/tests/correctness/corr-sbmv.cpp +++ b/src/tests/correctness/corr-sbmv.cpp @@ -192,6 +192,15 @@ sbmvCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, params->K, params->alpha, params->offA, + params->lda, params->offBX, params->incx, params->beta, params->offCY, params->incy); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << 
"queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, X, blasY, clblasY); delete[] events; } diff --git a/src/tests/correctness/corr-scal.cpp b/src/tests/correctness/corr-scal.cpp index ecd8b829..72d62464 100644 --- a/src/tests/correctness/corr-scal.cpp +++ b/src/tests/correctness/corr-scal.cpp @@ -150,6 +150,14 @@ void scalCorrectnessTest(TestParams *params) compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->alpha, params->offBX, params->incx); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasX, clblasX); delete[] events; } diff --git a/src/tests/correctness/corr-spmv.cpp b/src/tests/correctness/corr-spmv.cpp index 52aded01..58f14c4a 100644 --- a/src/tests/correctness/corr-spmv.cpp +++ b/src/tests/correctness/corr-spmv.cpp @@ -189,6 +189,15 @@ spmvCorrectnessTest(TestParams *params) compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, base->alpha(), params->offA, + 0, params->offBX, params->incx, base->beta(), params->offCY, params->incy); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(AP, X, blasY, clblasY); delete[] events; } diff --git a/src/tests/correctness/corr-spr.cpp b/src/tests/correctness/corr-spr.cpp index 4851e7de..56081122 100644 --- a/src/tests/correctness/corr-spr.cpp +++ b/src/tests/correctness/corr-spr.cpp @@ -192,7 +192,6 @@ sprCorrectnessTest(TestParams *params) } releaseMemObjects(bufAP, bufX); - printf("Comparing the results\n"); compareMatrices(clblasColumnMajor, lengthAP , 1, (blasAP + params->offa), (clblasAP + 
params->offa), lengthAP); diff --git a/src/tests/correctness/corr-swap.cpp b/src/tests/correctness/corr-swap.cpp index db1eb6d5..706779d0 100644 --- a/src/tests/correctness/corr-swap.cpp +++ b/src/tests/correctness/corr-swap.cpp @@ -174,6 +174,13 @@ swapCorrectnessTest(TestParams *params) compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (X + params->offBX), lengthX); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), lengthY); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->N, params->offBX, params->incx, params->offCY, params->incy); + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(X, Y, blasX, blasY); delete[] events; } diff --git a/src/tests/correctness/corr-symm.cpp b/src/tests/correctness/corr-symm.cpp index 4af106dd..c4670ac8 100644 --- a/src/tests/correctness/corr-symm.cpp +++ b/src/tests/correctness/corr-symm.cpp @@ -227,6 +227,15 @@ symmCorrectnessTest(TestParams *params) // handle lda correctly based on row-major/col-major.. 
compareMatrices(params->order, params->M , params->N, (C + params->offc), (backC + params->offc), params->ldc); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->side, params->uplo, params->M, params->N, 1, params->alpha, 1, params->beta, params->lda, params->ldb, params->ldc, params->offa, params->offb, params->offc); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, C, backC); delete[] events; } diff --git a/src/tests/correctness/corr-symv.cpp b/src/tests/correctness/corr-symv.cpp index d4c33c65..c466abfc 100644 --- a/src/tests/correctness/corr-symv.cpp +++ b/src/tests/correctness/corr-symv.cpp @@ -194,6 +194,15 @@ symvCorrectnessTest(TestParams *params) compareVectors(params->offCY, params->N, abs(params->incy), params->columnsC * params->rowsC, blasC, clblasC); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, useAlpha, base->alpha(), params->offA, params->lda, + params->incx, useBeta, base->beta(), params->incy); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-syr.cpp b/src/tests/correctness/corr-syr.cpp index cae06fcc..905585bb 100644 --- a/src/tests/correctness/corr-syr.cpp +++ b/src/tests/correctness/corr-syr.cpp @@ -229,10 +229,17 @@ syrCorrectnessTest(TestParams *params) // compareMatrices(clblasColumnMajor, 1, (params->lda - params->N), (blasA + params->offa + params->N), (tempA + params->offa + params->N), // params->lda); // delete[] tempA; - printf("Comparing the results\n"); compareMatrices(params->order, params->N , params->N, (blasA + params->offa), (clblasA + params->offa), params->lda); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, 
params->N, alpha, params->offBX, params->incx, params->offa, params->lda); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasA, clblasA, X); delete[] events; } diff --git a/src/tests/correctness/corr-syr2.cpp b/src/tests/correctness/corr-syr2.cpp index 303cce58..453f03c2 100644 --- a/src/tests/correctness/corr-syr2.cpp +++ b/src/tests/correctness/corr-syr2.cpp @@ -187,6 +187,14 @@ syr2CorrectnessTest(TestParams *params) compareMatrices(clblasColumnMajor, params->N , params->N, (blasA + params->offa), (clblasA + params->offa), params->lda); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->N, alpha, params->offBX, params->incx, params->offCY, params->incy, params->offa, params->lda); + + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(blasA, clblasA, X, Y); delete[] events; } diff --git a/src/tests/correctness/corr-syr2k.cpp b/src/tests/correctness/corr-syr2k.cpp index 1c05161e..313cfc6a 100644 --- a/src/tests/correctness/corr-syr2k.cpp +++ b/src/tests/correctness/corr-syr2k.cpp @@ -217,6 +217,14 @@ syr2kCorrectnessTest(TestParams *params) compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->N, params->K, useAlpha, base->alpha(), + params->offA, params->lda, params->offBX, params->ldb, useBeta, base->beta(), params->offCY, params->ldc); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-syrk.cpp b/src/tests/correctness/corr-syrk.cpp index 709ad49a..0e85505b 100644 --- 
a/src/tests/correctness/corr-syrk.cpp +++ b/src/tests/correctness/corr-syrk.cpp @@ -201,6 +201,14 @@ syrkCorrectnessTest(TestParams *params) compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->N, params->K, useAlpha, base->alpha(), + params->offA, params->lda, useBeta, base->beta(), params->offCY, params->ldc); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasC, clblasC); delete[] events; } diff --git a/src/tests/correctness/corr-tbmv.cpp b/src/tests/correctness/corr-tbmv.cpp index 5b37ee95..1fa78743 100644 --- a/src/tests/correctness/corr-tbmv.cpp +++ b/src/tests/correctness/corr-tbmv.cpp @@ -189,6 +189,15 @@ tbmvCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufX, bufXtemp); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->diag, params->N, params->K, params->offA, + params->lda, params->offBX, params->incx, 0, 1); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasX, clblasX); delete[] events; } diff --git a/src/tests/correctness/corr-tbsv.cpp b/src/tests/correctness/corr-tbsv.cpp index 36947c04..eb55b188 100644 --- a/src/tests/correctness/corr-tbsv.cpp +++ b/src/tests/correctness/corr-tbsv.cpp @@ -198,6 +198,15 @@ tbsvCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufX); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX, (deltaX + params->offBX) ); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, 
params->diag, params->N, params->K, params->offA, + params->lda, params->offBX, params->incx, 0, 1); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasX, clblasX, deltaX); delete[] events; } diff --git a/src/tests/correctness/corr-trmm.cpp b/src/tests/correctness/corr-trmm.cpp index bb6dca61..7cc62dfa 100644 --- a/src/tests/correctness/corr-trmm.cpp +++ b/src/tests/correctness/corr-trmm.cpp @@ -173,6 +173,15 @@ trmmCorrectnessTest(TestParams *params) releaseMemObjects(bufA, bufB); compareMatrices(params->order, params->M, params->N, blasB, clblasB, params->ldb); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, useAlpha, + base->alpha(), params->offA, params->lda, params->offBX, params->ldb); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasB, clblasB); delete[] events; } diff --git a/src/tests/correctness/corr-trmv.cpp b/src/tests/correctness/corr-trmv.cpp index 7eb14daf..7e97d6c3 100644 --- a/src/tests/correctness/corr-trmv.cpp +++ b/src/tests/correctness/corr-trmv.cpp @@ -213,6 +213,14 @@ trmvCorrectnessTest(TestParams *params) compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->diag, params->N, params->lda, params->incx, params->offa, params->offBX); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasX, clblasX); delete[] events; } diff --git a/src/tests/correctness/corr-trsm.cpp b/src/tests/correctness/corr-trsm.cpp index a8a9253a..aa368f25 100644 --- 
a/src/tests/correctness/corr-trsm.cpp +++ b/src/tests/correctness/corr-trsm.cpp @@ -188,6 +188,15 @@ trsmCorrectnessTest(TestParams *params) compareMatrices(params->order, params->M, params->N, blasB, clblasB, params->ldb, delta); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, useAlpha, + base->alpha(), params->offA, params->lda, params->offBX, params->ldb); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, B, blasB, clblasB, delta); delete[] events; } diff --git a/src/tests/correctness/corr-trsv.cpp b/src/tests/correctness/corr-trsv.cpp index f3def004..dbda6ae7 100644 --- a/src/tests/correctness/corr-trsv.cpp +++ b/src/tests/correctness/corr-trsv.cpp @@ -208,6 +208,14 @@ trsvCorrectnessTest(TestParams *params) // handle lda correctly based on row-major/col-major.. compareMatrices( clblasColumnMajor, lengthX , 1, blasX, backX, lengthX, deltaX ); + + if (::testing::Test::HasFailure()) + { + printTestParams(params->order, params->uplo, params->transA, params->diag, params->N, params->lda, params->incx, params->offa, params->offBX); + ::std::cerr << "seed = " << params->seed << ::std::endl; + ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; + } + deleteBuffers(A, blasX, backX, deltaX); delete[] events; } diff --git a/src/tests/include/asum.h b/src/tests/include/asum.h index 0c3f508c..131ff68f 100644 --- a/src/tests/include/asum.h +++ b/src/tests/include/asum.h @@ -61,10 +61,6 @@ class ASUM : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offx, incx); - ::std::cerr << "offAsum = " << offAsum << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/axpy.h b/src/tests/include/axpy.h index c2301774..3bb8b140 100644 --- 
a/src/tests/include/axpy.h +++ b/src/tests/include/axpy.h @@ -71,9 +71,6 @@ class AXPY : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, paramAlpha, offBX, incx, offCY, incy); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/copy.h b/src/tests/include/copy.h index 5c26a228..f77130e3 100644 --- a/src/tests/include/copy.h +++ b/src/tests/include/copy.h @@ -63,9 +63,6 @@ class COPY : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offx, incx, offy, incy); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/dot.h b/src/tests/include/dot.h index 7b15528e..55962d37 100644 --- a/src/tests/include/dot.h +++ b/src/tests/include/dot.h @@ -67,10 +67,6 @@ class DOT : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offx, incx, offy, incy); - ::std::cerr << "offDP = " << offDP << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/dotc.h b/src/tests/include/dotc.h index 815ad1cd..ffcdbcf2 100644 --- a/src/tests/include/dotc.h +++ b/src/tests/include/dotc.h @@ -67,10 +67,6 @@ class DOTC : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offx, incx, offy, incy); - ::std::cerr << "offDP = " << offDP << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/gbmv.h b/src/tests/include/gbmv.h index f26dbd7b..a6823ef4 100644 --- a/src/tests/include/gbmv.h +++ b/src/tests/include/gbmv.h @@ -99,11 +99,6 @@ class GBMV : public TestWithParam< KL = KL % M; KU = KU % N; lda = ::std::max(lda, (KL+KU+1)); - - printTestParams(order, transA, M, N, KL, KU, paramAlpha, offA, - lda, offx, incx, paramBeta, offy, incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << 
::std::endl; } clblasOrder order; diff --git a/src/tests/include/gemm-2.h b/src/tests/include/gemm-2.h index a4103e77..5f54843b 100644 --- a/src/tests/include/gemm-2.h +++ b/src/tests/include/gemm-2.h @@ -148,12 +148,6 @@ class GEMM2 : public TestWithParam< rowsC = ldc; break; } - - printTestParams(order, transA, transB, M, N, K, useAlpha, - base->alpha(), offA, lda, offB, ldb, useBeta, - base->beta(), offC, ldc); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/gemm.h b/src/tests/include/gemm.h index 1307fac4..2156df39 100644 --- a/src/tests/include/gemm.h +++ b/src/tests/include/gemm.h @@ -151,11 +151,6 @@ class GEMM : public TestWithParam< break; } - ::std::cerr << " seed = " << seed << ", " - << "queues = " << numCommandQueues << ", "; - printTestParams(order, transA, transB, M, N, K, useAlpha, - base->alpha(), offA, lda, offB, ldb, useBeta, - base->beta(), offC, ldc); } clblasOrder order; diff --git a/src/tests/include/gemv.h b/src/tests/include/gemv.h index 77f1c8ec..66080c96 100644 --- a/src/tests/include/gemv.h +++ b/src/tests/include/gemv.h @@ -227,11 +227,6 @@ class GEMV : public TestWithParam< if (!seqY) { incy = incy > 0 ? 
(int)ldc : (int)(0-ldc); } - - printTestParams(order, transA, M, N, useAlpha, base->alpha(), offA, - lda, incx, useBeta, base->beta(), incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/ger.h b/src/tests/include/ger.h index 3c746dd9..ba42f270 100644 --- a/src/tests/include/ger.h +++ b/src/tests/include/ger.h @@ -102,14 +102,6 @@ class GER : public TestWithParam< lda = ::std::max(lda, rowsA); break; } - - - printTestParams(order, M, N, useAlpha, - base->alpha(), - lda, incx, incy, offa, offx, offy); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/gerc.h b/src/tests/include/gerc.h index 23b09c31..cca220f1 100644 --- a/src/tests/include/gerc.h +++ b/src/tests/include/gerc.h @@ -98,12 +98,6 @@ class GERC : public TestWithParam< if( lda == 0 ) lda = ::std::max(M, N); - printTestParams(order, M, N, useAlpha, - base->alpha(), - lda, incx, incy, offa, offx, offy); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/hbmv.h b/src/tests/include/hbmv.h index adbd022d..6056c5e0 100644 --- a/src/tests/include/hbmv.h +++ b/src/tests/include/hbmv.h @@ -93,11 +93,6 @@ class HBMV : public TestWithParam< KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); - - printTestParams(order, uplo, N, KLU, paramAlpha, offA, - lda, offx, incx, paramBeta, offy, incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/hemm.h b/src/tests/include/hemm.h index 000c897a..98ceb8d9 100644 --- a/src/tests/include/hemm.h +++ b/src/tests/include/hemm.h @@ -118,11 +118,6 @@ class HEMM : public TestWithParam< ldc = ::std::max(ldc, M); 
break; } - - printTestParams(order, side, uplo, M, N, 1, alpha, 1, beta, lda, ldb, ldc, offA, offb, offc); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/hemv.h b/src/tests/include/hemv.h index dcdb84a7..0adba22b 100644 --- a/src/tests/include/hemv.h +++ b/src/tests/include/hemv.h @@ -121,11 +121,6 @@ class HPMV : public TestWithParam< if (base->useIncY()) { incy = base->incY(); } - - printTestParams(order, uplo, N, paramAlpha, offA, - lda, offx, incx, paramBeta, offy, incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/her.h b/src/tests/include/her.h index 84b405c5..505321de 100644 --- a/src/tests/include/her.h +++ b/src/tests/include/her.h @@ -96,12 +96,6 @@ class HPR : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(order, uplo, N, alpha, - offx, incx, offa, lda ); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/her2.h b/src/tests/include/her2.h index f64cb303..3b8eae02 100644 --- a/src/tests/include/her2.h +++ b/src/tests/include/her2.h @@ -107,11 +107,6 @@ class HPR2 : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(order, uplo, N, 1, alpha, offx, incx, offy, incy, offa, lda); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/her2k.h b/src/tests/include/her2k.h index 32e6172a..1a1b3cd1 100644 --- a/src/tests/include/her2k.h +++ b/src/tests/include/her2k.h @@ -137,11 +137,6 @@ class HER2K : public TestWithParam< rowsC = ldc; break; } - - printTestParams(order, uplo, transA, N, K, true, paramAlpha, - offa, lda, offB, ldb, 
true, paramBeta, offC, ldc); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/herk.h b/src/tests/include/herk.h index 7a5d5b50..a0376fe6 100644 --- a/src/tests/include/herk.h +++ b/src/tests/include/herk.h @@ -130,11 +130,6 @@ class HERK : public TestWithParam< rowsC = ldc; break; } - - printTestParams(order, uplo, transA, N, K, true, paramAlpha, - offA, lda, true, paramBeta, offC, ldc); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/iamax.h b/src/tests/include/iamax.h index b81c50a0..3aca87eb 100644 --- a/src/tests/include/iamax.h +++ b/src/tests/include/iamax.h @@ -59,9 +59,6 @@ class iAMAX : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offx, incx); - ::std::cerr << "offiAmax = " << offiAmax << ::std::endl; } size_t N; diff --git a/src/tests/include/nrm2.h b/src/tests/include/nrm2.h index 86370672..0102102f 100644 --- a/src/tests/include/nrm2.h +++ b/src/tests/include/nrm2.h @@ -61,10 +61,6 @@ class NRM2 : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offx, incx); - ::std::cerr << "offNRM2 = " << offNRM2 << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/rot.h b/src/tests/include/rot.h index 54445091..b3014fbf 100644 --- a/src/tests/include/rot.h +++ b/src/tests/include/rot.h @@ -70,9 +70,6 @@ class ROT : public TestWithParam< { numCommandQueues = base->numCommandQueues(); } - - printTestParams(N, offa, incx, offb, incy, alpha, beta ); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N, offa, offb; diff --git a/src/tests/include/rotg.h b/src/tests/include/rotg.h index 874f7577..3cec78bb 100644 --- a/src/tests/include/rotg.h +++ b/src/tests/include/rotg.h @@ 
-61,9 +61,6 @@ class ROTG : public TestWithParam< { numCommandQueues = base->numCommandQueues(); } - - printTestParams(offSA, offSB, offC, offS); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t offSA, offSB, offC, offS; diff --git a/src/tests/include/rotm.h b/src/tests/include/rotm.h index 9600b5af..3775e2f2 100644 --- a/src/tests/include/rotm.h +++ b/src/tests/include/rotm.h @@ -70,9 +70,6 @@ class ROTM : public TestWithParam< { numCommandQueues = base->numCommandQueues(); } - - printTestParams(N, offa, incx, offb, incy, offc, alpha); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N, offa, offb, offc; diff --git a/src/tests/include/rotmg.h b/src/tests/include/rotmg.h index c5357197..3966b86c 100644 --- a/src/tests/include/rotmg.h +++ b/src/tests/include/rotmg.h @@ -68,9 +68,6 @@ class ROTMG : public TestWithParam< { numCommandQueues = base->numCommandQueues(); } - - printTestParams(offBX, offCY, offa, offb, offc, alpha); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } int offa, offb, offc, offBX, offCY; diff --git a/src/tests/include/sbmv.h b/src/tests/include/sbmv.h index 04289104..1253a9ea 100644 --- a/src/tests/include/sbmv.h +++ b/src/tests/include/sbmv.h @@ -93,11 +93,6 @@ class SBMV : public TestWithParam< KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); - - printTestParams(order, uplo, N, KLU, paramAlpha, offA, - lda, offx, incx, paramBeta, offy, incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/scal.h b/src/tests/include/scal.h index 922407cf..1225954a 100644 --- a/src/tests/include/scal.h +++ b/src/tests/include/scal.h @@ -62,10 +62,6 @@ class SCAL : public TestWithParam< if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } - - printTestParams(N, paramAlpha, offx, incx); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << 
"queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/spmv.h b/src/tests/include/spmv.h index dce82936..7bd245c1 100644 --- a/src/tests/include/spmv.h +++ b/src/tests/include/spmv.h @@ -104,11 +104,6 @@ class SPMV : public TestWithParam< if (base->useIncY()) { incy = base->incY(); } - - printTestParams(order, uplo, N, paramAlpha, offA, - 0, offx, incx, paramBeta, offy, incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/swap.h b/src/tests/include/swap.h index 6afda58d..c2f20551 100644 --- a/src/tests/include/swap.h +++ b/src/tests/include/swap.h @@ -68,9 +68,6 @@ class SWAPXY : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(N, offBX, incx, offCY, incy); - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; diff --git a/src/tests/include/symm.h b/src/tests/include/symm.h index 29214b31..b2f3cd97 100644 --- a/src/tests/include/symm.h +++ b/src/tests/include/symm.h @@ -120,11 +120,6 @@ class SYMM : public TestWithParam< ldc = ::std::max(ldc, M); break; } - - printTestParams(order, side, uplo, M, N, 1, alpha, 1, beta, lda, ldb, ldc, offa, offb, offc); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/symv.h b/src/tests/include/symv.h index f8f76a64..e4542228 100644 --- a/src/tests/include/symv.h +++ b/src/tests/include/symv.h @@ -154,11 +154,6 @@ class SYMV : public TestWithParam< columnsC = N; break; } - - printTestParams(order, uplo, N, useAlpha, base->alpha(), offsetA, lda, - incx, useBeta, base->beta(), incy); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/syr.h b/src/tests/include/syr.h index 056dcba1..fd169fe1 
100644 --- a/src/tests/include/syr.h +++ b/src/tests/include/syr.h @@ -101,11 +101,6 @@ class SPR : public TestWithParam< // if (base->useAlpha()) { // alpha = base->Alpha(); // } - - printTestParams(order, uplo, N, alpha, offx, incx, offa, lda); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/syr2.h b/src/tests/include/syr2.h index 70456139..7f654c4e 100644 --- a/src/tests/include/syr2.h +++ b/src/tests/include/syr2.h @@ -108,11 +108,6 @@ class SPR2 : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(order, uplo, N, alpha, offx, incx, offy, incy, offa, lda); - - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/syr2k.h b/src/tests/include/syr2k.h index ff09c2fb..a1ebc1d4 100644 --- a/src/tests/include/syr2k.h +++ b/src/tests/include/syr2k.h @@ -141,11 +141,6 @@ class SYR2K : public TestWithParam< rowsC = ldc; break; } - - printTestParams(order, uplo, transA, N, K, useAlpha, base->alpha(), - offA, lda, offB, ldb, useBeta, base->beta(), offC, ldc); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/syrk.h b/src/tests/include/syrk.h index b855723b..4a0be453 100644 --- a/src/tests/include/syrk.h +++ b/src/tests/include/syrk.h @@ -125,11 +125,6 @@ class SYRK : public TestWithParam< rowsC = ldc; break; } - - printTestParams(order, uplo, transA, N, K, useAlpha, base->alpha(), - offA, lda, useBeta, base->beta(), offC, ldc); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/tbmv.h b/src/tests/include/tbmv.h index 19063d69..95f7974c 100644 --- a/src/tests/include/tbmv.h +++ 
b/src/tests/include/tbmv.h @@ -88,11 +88,6 @@ class TBMV : public TestWithParam< KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); - - printTestParams(order, uplo, transA, diag, N, KLU, offA, - lda, offx, incx, 0, 1); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/tbsv.h b/src/tests/include/tbsv.h index 890e1c27..5096ed50 100644 --- a/src/tests/include/tbsv.h +++ b/src/tests/include/tbsv.h @@ -89,11 +89,6 @@ class TBSV : public TestWithParam< KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); - - printTestParams(order, uplo, transA, diag, N, KLU, offA, - lda, offx, incx, 0, 1); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/trmm.h b/src/tests/include/trmm.h index b2e5482a..cc9d1a79 100644 --- a/src/tests/include/trmm.h +++ b/src/tests/include/trmm.h @@ -127,11 +127,6 @@ class TRMM : public TestWithParam< rowsB = ldb; break; } - - printTestParams(order, side, uplo, transA, diag, M, N, useAlpha, - base->alpha(), offA, lda, offB, ldb); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/trmv.h b/src/tests/include/trmv.h index 204bbce0..5b0c39f0 100644 --- a/src/tests/include/trmv.h +++ b/src/tests/include/trmv.h @@ -98,10 +98,6 @@ class TPMV : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(order, uplo, transA, diag, N, lda, incx, offa, offx); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/trsm.h b/src/tests/include/trsm.h index e0e90ea2..c509ed71 100644 --- a/src/tests/include/trsm.h +++ b/src/tests/include/trsm.h @@ -130,11 +130,6 @@ class TRSM : public TestWithParam< 
rowsB = ldb; break; } - - printTestParams(order, side, uplo, transA, diag, M, N, useAlpha, - base->alpha(), offA, lda, offB, ldb); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; diff --git a/src/tests/include/trsv.h b/src/tests/include/trsv.h index 6410d110..0f54cb99 100644 --- a/src/tests/include/trsv.h +++ b/src/tests/include/trsv.h @@ -97,10 +97,6 @@ class TPSV : public TestWithParam< if (base->useN()) { N = base->N(); } - - printTestParams(order, uplo, transA, diag, N, lda, incx, offa, offx); - ::std::cerr << "seed = " << seed << ::std::endl; - ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; From 22830771478415fc5236f077967b98872967733a Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Mon, 18 Apr 2016 17:26:05 -0500 Subject: [PATCH 31/45] Device selection for test-correctness and test-functional test-functional and test-correctness with derivatives have been enhanced to be able to specify the device under test through the command line. 
The ordinals can be queried with clinfo program Externally available, comes in AMD SDK --platform-ord is an unsigned value picking the platform --device-ord is an unsigned value picking the device --device has been renamed to --device-type --- src/tests/BlasBase.cpp | 89 +++++------------- src/tests/cmdline.c | 40 +++++++- src/tests/correctness/test-correctness.cpp | 102 +++++++++++---------- src/tests/functional/test-functional.cpp | 100 ++++++++++---------- src/tests/include/BlasBase.h | 8 +- src/tests/include/cmdline.h | 6 +- 6 files changed, 173 insertions(+), 172 deletions(-) diff --git a/src/tests/BlasBase.cpp b/src/tests/BlasBase.cpp index c012803d..73a6f5e2 100644 --- a/src/tests/BlasBase.cpp +++ b/src/tests/BlasBase.cpp @@ -40,10 +40,10 @@ BlasBase::BlasBase() useNumCommandQueues_(false), numCommandQueues_(1), useAlpha_(false), useBeta_(false), useSeed_(false), useM_(false), useN_(false), useK_(false), - M_(0), N_(0), K_(0), + M_(0), N_(0), K_(0), devOrd_(0), platOrd_(0), useIncX_(false), useIncY_(false), incX_(0), incY_(0), - useImages_(false), devType_(CL_DEVICE_TYPE_GPU), imageA_(0), imageB_(0) + useImages_(false), devType_(CL_DEVICE_TYPE_DEFAULT), imageA_(0), imageB_(0) { memset(&alpha_, 0, sizeof(alpha_)); memset(&beta_, 0, sizeof(beta_)); @@ -57,7 +57,7 @@ BlasBase::~BlasBase() /* * Teardown() is disabled due to troubles with test interrupting * with CTRL-C in windows. This occurs since after pressing of these keys - * the OpenCL runtime is destroyed before calling global object destructors. + * the OpenCL runtime is destroyed before calling global object destructor's. 
*/ #if 0 TearDown(); @@ -110,7 +110,6 @@ BlasBase::getDevice(cl_device_type type, const char* name, char *str; cl_platform_id *platforms, selPlatform = NULL; cl_uint nrPlatforms; - cl_device_info devInfo; nrPlatforms = getPlatforms(&platforms, &err); @@ -118,29 +117,9 @@ BlasBase::getDevice(cl_device_type type, const char* name, *error = CL_SUCCESS; } - /* - * If device name is not specified, then any AMD device is preferable. - * It there are not AMD devices of such a type presented in the system, - * then get a device of another vendor. If this is the additional device - * which is being tried to get, it must be supported by the same platform - * as the primary device does. - */ - - if (name == NULL) { - name = "Advanced Micro Devices, Inc."; - devInfo = CL_DEVICE_VENDOR; - } - else { - devInfo = CL_DEVICE_NAME; - type = CL_DEVICE_TYPE_ALL; - } - - for (p = 0; p < nrPlatforms; p++) { - cl_platform_id platform = platforms[p]; - err = clGetDeviceIDs(platform, type, 0, NULL, &nrDevices); - if (err == CL_DEVICE_NOT_FOUND) { - continue; - } + if (platOrd_ < nrPlatforms) { + platform_ = platforms[platOrd_]; + err = clGetDeviceIDs(platform_, type, 0, NULL, &nrDevices); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; @@ -152,7 +131,7 @@ BlasBase::getDevice(cl_device_type type, const char* name, } devices = new cl_device_id[nrDevices]; - err = clGetDeviceIDs(platform, type, nrDevices, devices, NULL); + err = clGetDeviceIDs(platform_, type, nrDevices, devices, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; @@ -161,40 +140,15 @@ BlasBase::getDevice(cl_device_type type, const char* name, return NULL; } - for (i = 0; i < nrDevices; i++) { - err = clGetDeviceInfo(devices[i], devInfo, 0, NULL, &sz); - if (err != CL_SUCCESS) { - continue; - } - str = new char[sz + 1]; - memset(str, 0, sz + 1); - err = clGetDeviceInfo(devices[i], devInfo, sz, str, NULL); - if (err != CL_SUCCESS) { - delete[] str; - continue; - } - if ((devInfo == CL_DEVICE_VENDOR) 
&& (result == NULL) && - ((platform_ == NULL) || (platform == platform_))) { - - result = devices[i]; - selPlatform = platform; - } - printf("---- %s\n", str); - if (strcmp(str, name) == 0) { - //printf("---- %s\n", str); - platform_ = platform; - result = devices[i]; - delete[] str; - break; - } - delete[] str; + if (devOrd_ < nrDevices) { + result = devices[devOrd_]; } delete[] devices; devices = NULL; } - - if (platform_ == NULL) { - platform_ = selPlatform; + else + { + platform_ = NULL; } delete[] platforms; @@ -211,6 +165,7 @@ BlasBase::SetUp() cl_device_id devices[2] = {NULL, NULL}; primaryDevice_ = getDevice(devType_, devName_, &err); + if ((err != CL_SUCCESS) || (primaryDevice_ == NULL)) { ASSERT_EQ(CL_SUCCESS, clGetPlatformIDs(1, &platform_, NULL)); ASSERT_EQ(CL_SUCCESS, @@ -243,7 +198,7 @@ BlasBase::SetUp() printf("SetUp: Created context %p\n", context_); #endif printf("SetUp: about to create command queues\n"); - for (i = 0; i < MAX_COMMAND_QUEUES; i++) { + for (i = 0; i < numCommandQueues_; i++) { cl_device_id dev; dev = (i == addDevQueueIdx) ? 
additionalDevice_ : primaryDevice_; @@ -260,10 +215,9 @@ BlasBase::TearDown() { cl_uint i; - for (i = 0; i < MAX_COMMAND_QUEUES; i++) { + for (i = 0; i < numCommandQueues_; i++) { clReleaseCommandQueue(commandQueues_[i]); } - numCommandQueues_ = 1; if (context_ != NULL) { clReleaseContext(context_); @@ -282,20 +236,23 @@ BlasBase::initialized() } bool -BlasBase::setDeviceType(cl_device_type* devType, const char* devName) +BlasBase::setDeviceType(const TestParams& params) { - if (devType_ == *devType && devName_ == devName) { + // Early exit if no device state changed + if (devType_ == params.devType && devName_ == params.devName && platOrd_ == params.platOrd && devOrd_ == params.devOrd) { return true; } - devType_ = *devType; - devName_ = devName; + devType_ = params.devType; + devName_ = params.devName; + platOrd_ = params.platOrd; + devOrd_ = params.devOrd; if (!initialized()) { return true; } TearDown(); SetUp(); - *devType = devType_; + return initialized(); } diff --git a/src/tests/cmdline.c b/src/tests/cmdline.c index 259a9f2a..c9519cc6 100644 --- a/src/tests/cmdline.c +++ b/src/tests/cmdline.c @@ -24,7 +24,7 @@ static const char *testUsage = " [--seed s] [--alpha a] [--beta b] " "[--alpha-real a] [--beta-real b] [--alpha-imag a] [--beta-imag b] " - "[--use-images f] [--device dev] [--queues n]\n" + "[--use-images f] [--platform ordinal] [--device string] [--device-ord ordinal] [--queues n]\n" "\n" "seed - seed for the random number generator" "\n" @@ -42,8 +42,12 @@ static const char *testUsage = "\n" "use-images - allow the library to use images for computing" "\n" - "device - device to run the test on, 'cpu' or 'gpu'(default)" + "platform-ord - platform ordinal containing device of interest as reported by clinfo; (default 0)" "\n" + "device-ord - device ordinal as device under test as reported by clinfo; (default 0)" + "\n" + "device-type - device type to filter device enumeration: 'default', 'all', 'gpu' or 'cpu'\n" + "\t\tWith 'default', platform-ord && 
device-ord should both be 0\n" "queues - number of command queues to use" "\n" "Parameters defined through the command line are kept over the whole " @@ -169,7 +173,7 @@ setMult(SetterArg *sarg) } static int -setDevice(SetterArg *sarg) +setDevice_type(SetterArg *sarg) { if (!strcmp(sarg->arg, "cpu")) { sarg->params->devType = CL_DEVICE_TYPE_CPU; @@ -181,11 +185,37 @@ setDevice(SetterArg *sarg) sarg->params->devName = NULL; return 0; } + if (!strcmp(sarg->arg, "all")) { + sarg->params->devType = CL_DEVICE_TYPE_ALL; + sarg->params->devName = NULL; + return 0; + } + if (!strcmp(sarg->arg, "default")) { + sarg->params->devType = CL_DEVICE_TYPE_DEFAULT; + sarg->params->devName = NULL; + return 0; + } sarg->params->devName = sarg->arg; return 0; } +static int +setDevice(SetterArg *sarg) +{ + sarg->params->devOrd = atoi(sarg->arg); + + return 0; +} + +static int +setPlatform(SetterArg *sarg) +{ + sarg->params->platOrd = atoi(sarg->arg); + + return 0; +} + static int setNumCommandQueues(SetterArg *sarg) { @@ -202,7 +232,9 @@ static const CmdLineOpt opts[] = { {"alpha-imag", SET_ALPHA, setMult, MULT_ALPHA | MULT_IMAG_ONLY}, {"beta-real", SET_BETA, setMult, MULT_BETA | MULT_REAL_ONLY}, {"beta-imag", SET_BETA, setMult, MULT_BETA | MULT_IMAG_ONLY}, - {"device", SET_DEVICE_TYPE, setDevice, 0}, + {"platform-ord", SET_PLATFORM_ORD, setPlatform, 0 }, + {"device-type", SET_DEVICE_TYPE, setDevice_type, 0}, + {"device-ord", SET_DEVICE_ORD, setDevice, 0 }, {"queues", SET_NUM_COMMAND_QUEUES, setNumCommandQueues, 0}, }; static const unsigned int nrOpts = sizeof(opts) / sizeof(CmdLineOpt); diff --git a/src/tests/correctness/test-correctness.cpp b/src/tests/correctness/test-correctness.cpp index 67ac9715..3b80a039 100644 --- a/src/tests/correctness/test-correctness.cpp +++ b/src/tests/correctness/test-correctness.cpp @@ -3307,11 +3307,8 @@ INSTANTIATE_TEST_CASE_P(MultipleQueues, iAMAX, Combine( int main(int argc, char *argv[]) { - ::clMath::BlasBase *base; - TestParams params; int ret; - 
- if( (argc > 1) && ( !strcmp(argv[1], "--test-help") || !strcmp(argv[1], "-?") || !strcmp(argv[1], "-h") ) ) + if( (argc > 1) && ( !strcmp(argv[1], "--test-help") || !strcmp(argv[1], "-?") || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help") ) ) { printUsage("test-correctness"); ::testing::InitGoogleTest(&argc, argv); @@ -3331,58 +3328,63 @@ main(int argc, char *argv[]) } ::testing::InitGoogleTest(&argc, argv); - ::std::cerr << "Initialize OpenCL and clblas..." << ::std::endl; - base = ::clMath::BlasBase::getInstance(); + TestParams params; + params.optFlags = NO_FLAGS; + params.devType = CL_DEVICE_TYPE_ALL; + params.devName = NULL; + params.devOrd = 0; + params.platOrd = 0; + + if (argc != 1) { + if (parseBlasCmdLineArgs(argc, argv, &params) != 0) { + printUsage(argv[0]); + return 1; + } + } + + ::std::cout << "Initialize default OpenCL and clblas..." << ::std::endl; + ::clMath::BlasBase* base = ::clMath::BlasBase::getInstance( ); if (base == NULL) { ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! " - "Leaving the test." << ::std::endl; + "Leaving the test."
<< ::std::endl; return -1; } base->setSeed(DEFAULT_SEED); - if (argc != 1) { - params.optFlags = NO_FLAGS; - params.devType = CL_DEVICE_TYPE_GPU; - params.devName = NULL; - if (parseBlasCmdLineArgs(argc, argv, &params) != 0) { - printUsage(argv[0]); - return 1; - } - if (params.optFlags & SET_SEED) { - base->setSeed(params.seed); - } - if (params.optFlags & SET_ALPHA) { - base->setAlpha(params.alpha); - } - if (params.optFlags & SET_BETA) { - base->setBeta(params.beta); - } - if (params.optFlags & SET_M) { - base->setM(params.M); - } - if (params.optFlags & SET_N) { - base->setN(params.N); - } - if (params.optFlags & SET_K) { - base->setK(params.K); - } - if (params.optFlags & SET_INCX) { - base->setIncX(params.incx); - } - if (params.optFlags & SET_INCY) { - base->setIncY(params.incy); - } - if (params.optFlags & SET_DEVICE_TYPE) { - if (!base->setDeviceType(&params.devType, params.devName)) { - ::std::cerr << "Fatal error, OpenCL or clblas " - "initialization failed! Leaving the test." << - ::std::endl; - return -1; - } - } - if (params.optFlags & SET_NUM_COMMAND_QUEUES) { - base->setNumCommandQueues(params.numCommandQueues); + if (params.optFlags & SET_SEED) { + base->setSeed(params.seed); + } + if (params.optFlags & SET_ALPHA) { + base->setAlpha(params.alpha); + } + if (params.optFlags & SET_BETA) { + base->setBeta(params.beta); + } + if (params.optFlags & SET_M) { + base->setM(params.M); + } + if (params.optFlags & SET_N) { + base->setN(params.N); + } + if (params.optFlags & SET_K) { + base->setK(params.K); + } + if (params.optFlags & SET_INCX) { + base->setIncX(params.incx); + } + if (params.optFlags & SET_INCY) { + base->setIncY(params.incy); + } + if (params.optFlags & SET_NUM_COMMAND_QUEUES) { + base->setNumCommandQueues(params.numCommandQueues); + } + if ((params.optFlags & SET_DEVICE_TYPE) || (params.optFlags & SET_PLATFORM_ORD) || (params.optFlags & SET_DEVICE_ORD)) { + if (!base->setDeviceType(params)) { + ::std::cerr << "Fatal error, OpenCL or clblas " +
"initialization failed! Leaving the test." << + ::std::endl; + return -1; } } @@ -3407,7 +3409,7 @@ main(int argc, char *argv[]) } /* - * Explicitely tell the singleton to release all resources, + * Explicitly tell singleton to release all resources, * before we return from main. */ base->release( ); diff --git a/src/tests/functional/test-functional.cpp b/src/tests/functional/test-functional.cpp index c147b775..d5d0d4e9 100644 --- a/src/tests/functional/test-functional.cpp +++ b/src/tests/functional/test-functional.cpp @@ -24,66 +24,69 @@ int main(int argc, char *argv[]) { - ::clMath::BlasBase *base; - TestParams params; int ret; - - if ((argc > 1) && !strcmp(argv[1], "--test-help")) { + if ((argc > 1) && (!strcmp(argv[1], "--test-help") || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) { printUsage("test-functional"); + ::testing::InitGoogleTest(&argc, argv); return 0; } + TestParams params; + params.optFlags = NO_FLAGS; + params.devType = CL_DEVICE_TYPE_ALL; + params.devName = NULL; + params.devOrd = 0; + params.platOrd = 0; + + if (argc != 1) { + if (parseBlasCmdLineArgs(argc, argv, &params) != 0) { + printUsage(argv[0]); + return 1; + } + } + ::testing::InitGoogleTest(&argc, argv); - ::std::cerr << "Initialize OpenCL and clblas..." << ::std::endl; - base = ::clMath::BlasBase::getInstance(); + ::std::cout << "Initialize default OpenCL and clblas..." << ::std::endl; + ::clMath::BlasBase *base = ::clMath::BlasBase::getInstance(); if (base == NULL) { ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! " "Leaving the test."
<< ::std::endl; return -1; } - if (argc != 1) { - params.optFlags = NO_FLAGS; - params.devType = CL_DEVICE_TYPE_GPU; - params.devName = NULL; - if (parseBlasCmdLineArgs(argc, argv, &params) != 0) { - printUsage(argv[0]); - return 1; - } - if (params.optFlags & SET_SEED) { - base->setSeed(params.seed); - } - if (params.optFlags & SET_ALPHA) { - base->setAlpha(params.alpha); - } - if (params.optFlags & SET_BETA) { - base->setBeta(params.beta); - } - if (params.optFlags & SET_M) { - base->setM(params.M); - } - if (params.optFlags & SET_N) { - base->setN(params.N); - } - if (params.optFlags & SET_K) { - base->setK(params.K); - } - if (params.optFlags & SET_INCX) { - base->setIncX(params.incx); - } - if (params.optFlags & SET_INCY) { - base->setIncY(params.incy); - } - if (params.optFlags & SET_DEVICE_TYPE) { - if (!base->setDeviceType(&params.devType, params.devName)) { - ::std::cerr << "Fatal error, OpenCL or clblas " - "initialization failed! Leaving the test." << - ::std::endl; - return -1; - } - } - if (params.optFlags & SET_NUM_COMMAND_QUEUES) { - base->setNumCommandQueues(params.numCommandQueues); + if (params.optFlags & SET_SEED) { + base->setSeed(params.seed); + } + if (params.optFlags & SET_ALPHA) { + base->setAlpha(params.alpha); + } + if (params.optFlags & SET_BETA) { + base->setBeta(params.beta); + } + if (params.optFlags & SET_M) { + base->setM(params.M); + } + if (params.optFlags & SET_N) { + base->setN(params.N); + } + if (params.optFlags & SET_K) { + base->setK(params.K); + } + if (params.optFlags & SET_INCX) { + base->setIncX(params.incx); + } + if (params.optFlags & SET_INCY) { + base->setIncY(params.incy); + } + if (params.optFlags & SET_NUM_COMMAND_QUEUES) { + base->setNumCommandQueues(params.numCommandQueues); + } + if ((params.optFlags & SET_DEVICE_TYPE) || (params.optFlags & SET_PLATFORM_ORD) || (params.optFlags & SET_DEVICE_ORD)) { + if (!base->setDeviceType( params )) { + ::std::cerr << "Fatal error, OpenCL or clblas " + "initialization failed!
Leaving the test." << + ::std::endl; + return -1; } } @@ -101,6 +104,7 @@ main(int argc, char *argv[]) } */ + base->printEnvInfo(); ret = RUN_ALL_TESTS(); if (base->useImages()) { diff --git a/src/tests/include/BlasBase.h b/src/tests/include/BlasBase.h index bcde47f2..92454390 100644 --- a/src/tests/include/BlasBase.h +++ b/src/tests/include/BlasBase.h @@ -61,6 +61,8 @@ class BlasBase { cl_device_id additionalDevice_; cl_context context_; cl_command_queue commandQueues_[MAX_COMMAND_QUEUES]; + size_t devOrd_; + size_t platOrd_; bool useNumCommandQueues_; cl_uint numCommandQueues_; @@ -85,7 +87,7 @@ class BlasBase { cl_ulong imageA_; cl_ulong imageB_; - BlasBase(); + BlasBase( ); ~BlasBase(); BlasBase(const BlasBase &); // intentionally undefined BlasBase & operator=(const BlasBase &); // intentionally undefined @@ -101,7 +103,7 @@ class BlasBase { int primAdd); public: - static BlasBase* getInstance(); + static BlasBase* getInstance( ); cl_context context() { @@ -197,7 +199,7 @@ class BlasBase { useImages_ = (value != 0); } - bool setDeviceType(cl_device_type* devType, const char* devName); + bool setDeviceType(const TestParams& params ); cl_mem createEnqueueBuffer(const void *data, size_t matrSize, size_t off, cl_mem_flags mode); cl_mem readBuffer(void *ptr, size_t off, size_t size); diff --git a/src/tests/include/cmdline.h b/src/tests/include/cmdline.h index addb9292..68ddfba1 100644 --- a/src/tests/include/cmdline.h +++ b/src/tests/include/cmdline.h @@ -42,7 +42,9 @@ typedef enum SetoptFlags { SET_DEVICE_TYPE = (1 << 7), SET_INCX = (1 << 8), SET_INCY = (1 << 9), - SET_NUM_COMMAND_QUEUES = (1 << 10) + SET_NUM_COMMAND_QUEUES = (1 << 10), + SET_DEVICE_ORD = (1 << 11), + SET_PLATFORM_ORD = (1 << 12), } SetoptFlags; typedef struct TestParams { @@ -83,6 +85,8 @@ typedef struct TestParams { int useImages; cl_device_type devType; const char* devName; + size_t devOrd; + size_t platOrd; cl_uint numCommandQueues; SetoptFlags optFlags; } TestParams; From 
da0fd1b706dc7977175d89f483f65fb98131d9de Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Wed, 20 Apr 2016 17:50:11 +0200 Subject: [PATCH 32/45] Make installing source tree optional --- src/library/CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt index 1925b6c2..f0c6111f 100644 --- a/src/library/CMakeLists.txt +++ b/src/library/CMakeLists.txt @@ -91,6 +91,8 @@ option( PRECOMPILE_GEMM_TRANS_CN "AutoGemm: pre-compile CN transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_CT "AutoGemm: pre-compile CT transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_CC "AutoGemm: pre-compile CC transpose cases" OFF) +option( INSTALL_SRC "Install source tree" OFF ) + set( AUTOGEMM_ARCHITECTURE "Hawaii" CACHE STRING "AutoGemm: device for kernel selection logic" ) set_property( CACHE AUTOGEMM_ARCHITECTURE PROPERTY STRINGS "Hawaii" "Fiji" ) @@ -920,8 +922,11 @@ install( DIRECTORY ${PROJECT_BINARY_DIR}/staging/ CONFIGURATIONS Debug FILES_MATCHING PATTERN "*.pdb" ) +if(INSTALL_SRC) # Install a snapshot of the source as it was for this build; useful for the .pdb's -install( DIRECTORY ${PROJECT_SOURCE_DIR} - DESTINATION ${CLBLAS_RUNTIME_DESTINATION} - OPTIONAL - CONFIGURATIONS Debug ) + install( DIRECTORY ${PROJECT_SOURCE_DIR} + DESTINATION ${CLBLAS_RUNTIME_DESTINATION} + OPTIONAL + CONFIGURATIONS Debug ) +endif() + From e0df18b178ca36531ca4288c6469f0593c9ebea8 Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Wed, 20 Apr 2016 16:57:58 -0500 Subject: [PATCH 33/45] Removing the pedantic flag from gcc compiles The library was not developed with the pedantic warning flag enabled, and the build outputs a volume of verbose warning messages on every build. It's not currently helpful to have this enabled. This flag should be enabled again as the warnings get fixed, to enable a more robust library. Various warnings and #pragmas fixed that were remaining. 
Changed the allocation of a temp buffer in corr-trmv.cpp --- .gitignore | 3 - src/CMakeLists.txt | 4 +- src/FindOpenCL.cmake | 143 +++++++++++++----- src/library/blas/AutoGemm/Includes.py | 8 +- src/library/blas/AutoGemm/KernelOpenCL.py | 2 +- src/library/blas/gens/trmm.c | 2 +- .../blas/trtri/diag_dtrtri_lower_128_16.cpp | 1 - .../blas/trtri/diag_dtrtri_upper_128_16.cpp | 15 +- .../blas/trtri/diag_dtrtri_upper_192_12.cpp | 9 +- .../triple_dgemm_update_128_16_PART1_L.cpp | 9 +- .../triple_dgemm_update_128_16_PART2_L.cpp | 1 - .../trtri/triple_dgemm_update_128_16_R.cpp | 11 +- .../triple_dgemm_update_128_32_PART1_L.cpp | 7 +- .../triple_dgemm_update_128_32_PART1_R.cpp | 7 +- .../triple_dgemm_update_128_32_PART2_L.cpp | 1 - .../triple_dgemm_update_128_32_PART2_R.cpp | 1 - .../triple_dgemm_update_128_64_PART1_L.cpp | 7 +- .../triple_dgemm_update_128_64_PART1_R.cpp | 5 +- .../triple_dgemm_update_128_64_PART2_L.cpp | 1 - .../triple_dgemm_update_128_64_PART2_R.cpp | 1 - ...riple_dgemm_update_128_ABOVE64_PART1_L.cpp | 7 +- ...riple_dgemm_update_128_ABOVE64_PART1_R.cpp | 5 +- ...riple_dgemm_update_128_ABOVE64_PART2_L.cpp | 1 - ...riple_dgemm_update_128_ABOVE64_PART2_R.cpp | 1 - ...riple_dgemm_update_128_ABOVE64_PART3_L.cpp | 1 - ...riple_dgemm_update_128_ABOVE64_PART3_R.cpp | 1 - .../trtri/triple_dgemm_update_192_12_R.cpp | 5 +- .../triple_dgemm_update_192_24_PART1_R.cpp | 1 - .../triple_dgemm_update_192_24_PART2_R.cpp | 1 - .../triple_dgemm_update_192_48_PART1_R.cpp | 3 +- .../triple_dgemm_update_192_48_PART2_R.cpp | 1 - .../triple_dgemm_update_192_96_PART1_R.cpp | 3 +- .../triple_dgemm_update_192_96_PART2_R.cpp | 1 - src/library/blas/xgemm.cc | 33 ++-- src/tests/CMakeLists.txt | 26 ++-- src/tests/correctness/corr-trmv.cpp | 2 +- src/tests/include/cmdline.h | 2 +- src/tests/include/matrix.h | 118 +++++++-------- 38 files changed, 240 insertions(+), 210 deletions(-) diff --git a/.gitignore b/.gitignore index bb362782..d25acd55 100644 --- a/.gitignore +++ b/.gitignore @@ 
-24,8 +24,5 @@ # vim temp files .*.swp -src/build/ - # python compiled files *.pyc - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7d90f28a..33a91ee2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -266,7 +266,7 @@ if( BUILD_TEST ) endif( ) # This will define OPENCL_FOUND -find_package( OpenCL ) +find_package( OpenCL ${OPENCL_VERSION} ) # Find Boost on the system, and configure the type of boost build we want set( Boost_USE_MULTITHREADED ON ) @@ -288,7 +288,7 @@ endif() # Turn on maximum compiler verbosity if(CMAKE_COMPILER_IS_GNUCXX) - add_definitions(-pedantic -Wall -Wextra + add_definitions(# -pedantic -Wall -Wextra -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=500 ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes" CACHE STRING diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake index 746fbe61..9810dd29 100644 --- a/src/FindOpenCL.cmake +++ b/src/FindOpenCL.cmake @@ -1,5 +1,5 @@ # ######################################################################## -# Copyright 2013 Advanced Micro Devices, Inc. +# Copyright 2015 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ # limitations under the License. # ######################################################################## - # Locate an OpenCL implementation. 
# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) # @@ -46,60 +45,122 @@ # target_link_libraries(foo ${OPENCL_LIBRARIES}) # #----------------------- +include( CheckSymbolExists ) +include( CMakePushCheckState ) + +if( DEFINED OPENCL_ROOT OR DEFINED ENV{OPENCL_ROOT}) + message( STATUS "Defined OPENCL_ROOT: ${OPENCL_ROOT}, ENV{OPENCL_ROOT}: $ENV{OPENCL_ROOT}" ) +endif( ) find_path(OPENCL_INCLUDE_DIRS - NAMES OpenCL/cl.h CL/cl.h - HINTS - ${OPENCL_ROOT}/include - $ENV{AMDAPPSDKROOT}/include - $ENV{CUDA_PATH}/include - PATHS - /usr/include - /usr/local/include - /usr/local/cuda/include - /opt/cuda/include - DOC "OpenCL header file path" + NAMES OpenCL/cl.h CL/cl.h + HINTS + ${OPENCL_ROOT}/include + $ENV{OPENCL_ROOT}/include + $ENV{AMDAPPSDKROOT}/include + $ENV{CUDA_PATH}/include + PATHS + /usr/include + /usr/local/include + /usr/local/cuda/include + DOC "OpenCL header file path" ) mark_as_advanced( OPENCL_INCLUDE_DIRS ) +message( STATUS "OPENCL_INCLUDE_DIRS: ${OPENCL_INCLUDE_DIRS}" ) + +set( OpenCL_VERSION "0.0" ) + +cmake_push_check_state( RESET ) +set( CMAKE_REQUIRED_INCLUDES "${OPENCL_INCLUDE_DIRS}" ) + +# Bug in check_symbol_exists prevents us from specifying a list of files, so we loop +# Only 1 of these files will exist on a system, so the other file will not clobber the output variable +if( APPLE ) + set( CL_HEADER_FILE "OpenCL/cl.h" ) +else( ) + set( CL_HEADER_FILE "CL/cl.h" ) +endif( ) + +check_symbol_exists( CL_VERSION_2_0 ${CL_HEADER_FILE} HAVE_CL_2_0 ) +check_symbol_exists( CL_VERSION_1_2 ${CL_HEADER_FILE} HAVE_CL_1_2 ) +check_symbol_exists( CL_VERSION_1_1 ${CL_HEADER_FILE} HAVE_CL_1_1 ) +# message( STATUS "HAVE_CL_2_0: ${HAVE_CL_2_0}" ) +# message( STATUS "HAVE_CL_1_2: ${HAVE_CL_1_2}" ) +# message( STATUS "HAVE_CL_1_1: ${HAVE_CL_1_1}" ) + +# set OpenCL_VERSION to the highest detected version +if( HAVE_CL_2_0 ) + set( OpenCL_VERSION "2.0" ) +elseif( HAVE_CL_1_2 ) + set( OpenCL_VERSION "1.2" ) +elseif( HAVE_CL_1_1 
) + set( OpenCL_VERSION "1.1" ) +endif( ) + +cmake_pop_check_state( ) # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) +if( LIB64 ) + message( STATUS "FindOpenCL searching for 64-bit libraries" ) +else( ) + message( STATUS "FindOpenCL searching for 32-bit libraries" ) +endif( ) if( LIB64 ) - find_library( OPENCL_LIBRARIES - NAMES OpenCL - HINTS - ${OPENCL_ROOT}/lib - $ENV{AMDAPPSDKROOT}/lib - $ENV{CUDA_PATH}/lib - DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86_64 x64 x86_64/sdk - PATHS - /usr/lib - /usr/local/cuda/lib - /opt/cuda/lib - ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86_64 x64 x86_64/sdk + PATHS + /usr/lib + /usr/local/cuda/lib + ) else( ) - find_library( OPENCL_LIBRARIES - NAMES OpenCL - HINTS - ${OPENCL_ROOT}/lib - $ENV{AMDAPPSDKROOT}/lib - $ENV{CUDA_PATH}/lib - DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86 Win32 - - PATHS - /usr/lib - /usr/local/cuda/lib - /opt/cuda/lib - ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86 Win32 + PATHS + /usr/lib + /usr/local/cuda/lib + ) endif( ) mark_as_advanced( OPENCL_LIBRARIES ) +# message( STATUS "OpenCL_FIND_VERSION: ${OpenCL_FIND_VERSION}" ) +if( OpenCL_VERSION VERSION_LESS OpenCL_FIND_VERSION ) + message( FATAL_ERROR "Requested OpenCL version: ${OpenCL_FIND_VERSION}, Found OpenCL version: ${OpenCL_VERSION}" ) +endif( ) + +# If we asked for OpenCL 1.2, and we found a version installed greater than that, pass the 'use deprecated' flag +if( (OpenCL_FIND_VERSION VERSION_LESS "2.0") AND (OpenCL_VERSION VERSION_GREATER OpenCL_FIND_VERSION) ) + add_definitions( 
-DCL_USE_DEPRECATED_OPENCL_2_0_APIS ) + + # If we asked for OpenCL 1.1, and we found a version installed greater than that, pass the 'use deprecated' flag + if( (OpenCL_FIND_VERSION VERSION_LESS "1.2") AND (OpenCL_VERSION VERSION_GREATER OpenCL_FIND_VERSION) ) + add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS ) + endif( ) +endif( ) + include( FindPackageHandleStandardArgs ) -FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL + REQUIRED_VARS OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS + VERSION_VAR OpenCL_VERSION + ) if( NOT OPENCL_FOUND ) message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +else( ) + message(STATUS "FindOpenCL ${OPENCL_LIBRARIES}, ${OPENCL_INCLUDE_DIRS}") endif() diff --git a/src/library/blas/AutoGemm/Includes.py b/src/library/blas/AutoGemm/Includes.py index 3c8435fa..0f616956 100644 --- a/src/library/blas/AutoGemm/Includes.py +++ b/src/library/blas/AutoGemm/Includes.py @@ -113,7 +113,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getRowName() @@ -123,7 +123,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getColName() @@ -133,7 +133,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin 
= 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getCornerName() @@ -143,7 +143,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" self.incFile.write( self.incStr ) diff --git a/src/library/blas/AutoGemm/KernelOpenCL.py b/src/library/blas/AutoGemm/KernelOpenCL.py index d7835d56..87a56761 100644 --- a/src/library/blas/AutoGemm/KernelOpenCL.py +++ b/src/library/blas/AutoGemm/KernelOpenCL.py @@ -482,7 +482,7 @@ def writeOpenCLKernelToFile(kernel): kernelFile.write("\";\n") kernelFile.write("\n") kernelFile.write("#else\n") - kernelFile.write("#pragma message(\"AutoGemmKernelSources.cpp: %s was overriden by user kernel.\")\n" % kernel.getName() ) + # kernelFile.write("#pragma message(\"AutoGemmKernelSources.cpp: %s was overriden by user kernel.\")\n" % kernel.getName() ) kernelFile.write("#endif\n") kernelFile.close() diff --git a/src/library/blas/gens/trmm.c b/src/library/blas/gens/trmm.c index 7655af34..0c8f8b4f 100644 --- a/src/library/blas/gens/trmm.c +++ b/src/library/blas/gens/trmm.c @@ -1245,7 +1245,7 @@ static int trmmGetDefaultDecomp( PGranularity *pgran, unsigned int subdimsNum, void *pArgs) { - (void*)subdimsNum; + DUMMY_ARG_USAGE(subdimsNum); if ( NULL == pArgs ) { return -EINVAL; diff --git a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp 
index 1f7c19c1..f3d6ca50 100644 --- a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp +++ b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP #define KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP -#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp index bc9c2961..f039b275 100644 --- a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp +++ b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP #define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP -#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -64,17 +63,17 @@ uint na)\n {\n if(tx <= i && i+bx*BLOCK_SIZE < na )\n {\n - Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);\n + Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);\n }\n else\n {\n Bs[i*BLOCK_SIZE+tx] = ZERO;\n }\n - }\n + }\n // read in the whole square block of my A and zero out the non data triangular - + // Synchronize to make sure the matrices are loaded - //__syncthreads(); + //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE);\n // solve the diagonals @@ -92,7 +91,7 @@ uint na)\n else \n {\n Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;\n - }\n + }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n @@ -139,14 +138,14 @@ uint na)\n // __syncthreads(); barrier(CLK_LOCAL_MEM_FENCE);\n }\n - + // write back A _Pragma("unroll")\n for( i=0; i < BLOCK_SIZE; i++ )\n {\n *(d_dinvA+i*NB+tx) = Bs[i*BLOCK_SIZE+tx];\n }\n - + }\n // end of kernel ); diff --git a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp index 0ffbebf7..0d81ee20 100644 --- a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp +++ 
b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP #define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP -#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -43,10 +42,10 @@ double neg_switcher; \n // Thread index int tx = get_local_id(0); \n -// Thread index +// Thread index int gx = get_global_id(0); \n -// Block index +// Block index int bx = get_group_id(0); \n A = A + offA; \n @@ -56,7 +55,7 @@ int NumBLperNB = NB / BLOCK_SIZE; \n d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); \n __local double Bs[BLOCK_SIZE*BLOCK_SIZE]; \n -__local double workspace[BLOCK_SIZE];\n // workspace used to store the current working column +__local double workspace[BLOCK_SIZE];\n // workspace used to store the current working column // load A \n _Pragma("unroll")\n @@ -74,7 +73,7 @@ for (i = 0; i < BLOCK_SIZE; i++)\n // read in the whole square block of my A and zero out the non data triangular // Synchronize to make sure the matrices are loaded -//__syncthreads(); +//__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n // solve the diagonals diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp index c0e3b4cd..f0c041fb 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp @@ -5,7 +5,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ Ain = Ain + offAin; \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) 
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n @@ -139,7 +138,7 @@ Ain = Ain + offAin; \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n - + B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp index 71c13dc2..dbffeb95 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp @@ -5,7 +5,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp index 237d3fe1..fd410a9a 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp @@ -1,14 +1,13 @@ /******************************************************************************* * Hand-tuned kernel - + * B21 = -inv(A11)*A12*inv(A22) * 16 to 32 - + ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ @@ -76,13 +75,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp index 46b7e970..e4bde337 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp @@ -6,7 +6,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -73,13 +72,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2;\n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? 
Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp index 3358af68..43760b63 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp index e480d6bb..12efa1a4 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp @@ -6,7 +6,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp index 6c04dee7..f0df0698 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp index eef824c7..11fa10b6 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -73,13 +72,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0;\n // there is already an overflow on xa + maxA = 0;\n // there is already an overflow on xa #define READA ( (incA < maxA ) ? 
Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp index 0f64809d..ad5d5487 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp index 83e0c7e4..cf387855 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp index 5ce3e42a..923f4763 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp index af7f518f..31a97fad 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? 
Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp index 51a3e87a..315908ed 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp index 674fdd58..a4cd85c6 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp index a45494b5..f13e19b9 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp index d2077bf1..b576114e 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp @@ -6,7 +6,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp index 004a8d20..adb22d74 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp @@ -8,7 +8,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp index 79bc4c01..4d645bc2 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp @@ -1,14 +1,13 @@ /******************************************************************************* * Hand-tuned kernel - + * B21 = -inv(A11)*A12*inv(A22) * 12 to 24 - + ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp index 6b62eb44..f6465d37 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp index d8c2f992..1e46a8d8 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp index dafa65b9..3dc05169 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -54,7 +53,7 @@ const char * const triple_dgemm_update_192_48_PART1_R_src = STRINGIFY( //each workgroup loads half of B (up or down) B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n - //decide invA12 location for each page; + //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4 of C C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp index 4571112d..37ea0a2a 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp index 8e58ab65..1416ff3c 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -55,7 +54,7 @@ const char * const triple_dgemm_update_192_96_PART1_R_src = STRINGIFY( //each workgroup loads 1/4 of B (up or down) B = d_dinvA + page_block*NB*NB + blk*NB + blk + gidy*(blk / 4)*NB; \n - //decide invA12 location for each page; + //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4*1/4 of C C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; \n diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp index 2ff217d1..9e961ffe 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) 
#__VA_ARGS__ diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index eb781127..a2c6cb00 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -170,7 +170,7 @@ void makeGemmKernel( #if defined( _WIN32 ) __declspec( thread ) static kernel_map_t *kernel_map = 0; #else - __thread static kernel_map_t *kernel_map = 0; + static __thread kernel_map_t *kernel_map = 0; #endif if (!kernel_map) { kernel_map = new kernel_map_t(); @@ -317,11 +317,11 @@ void makeGemmKernel( * get precision string *****************************************************************************/ template -char * getPrecision(); -template<> char * getPrecision() { return "s"; } -template<> char * getPrecision() { return "d"; } -template<> char * getPrecision() { return "c"; } -template<> char * getPrecision() { return "z"; } +const char * getPrecision(); +template<> const char * getPrecision() { return "s"; } +template<> const char * getPrecision() { return "d"; } +template<> const char * getPrecision() { return "c"; } +template<> const char * getPrecision() { return "z"; } /****************************************************************************** @@ -500,7 +500,7 @@ clblasGemm( &unroll); // make sure gemmSelectKernel found a valid kernel if (!tileKernelSource) { - printf("ERROR: gemmSelectKernel() couldn't find kernel(s) for { order=%s, transA=%s, transB=%s, M=%llu, N=%llu, K=%llu, beta=%u, onept=%f }\n", + printf("ERROR: gemmSelectKernel() couldn't find kernel(s) for { order=%s, transA=%s, transB=%s, M=%u, N=%u, K=%u, beta=%u, onept=%f }\n", order==clblasColumnMajor ? "ColMajor" : "RowMajor", transA==clblasNoTrans ? "N" : transA==clblasTrans ? "T" : "C", transB==clblasNoTrans ? "N" : transB==clblasTrans ? 
"T" : "C", @@ -566,8 +566,7 @@ clblasGemm( /****************************************************************************** * Build kernels *****************************************************************************/ - - + cl_kernel tileClKernel = NULL; cl_kernel rowClKernel = NULL; cl_kernel colClKernel = NULL; @@ -688,14 +687,14 @@ clblasSgemm( clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_FLOAT, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_FLOAT, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) @@ -748,14 +747,14 @@ clblasDgemm( clblasOrder order, clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) @@ -809,14 +808,14 @@ clblasCgemm( clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) @@ -870,14 +869,14 @@ clblasZgemm( clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + 
if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 9ecfd13e..b3944aa5 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -251,7 +251,7 @@ endif() # Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long # http://code.google.com/p/googletest/issues/detail?id=334 if( CMAKE_COMPILER_IS_GNUCXX ) - add_definitions( -Wno-long-long ) + add_definitions( -Wno-long-long -Wno-variadic-macros ) endif( ) if( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) @@ -259,7 +259,7 @@ if( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) # By default, -Mipa=fast is used, and this does not mix well with the cl compiler string( REPLACE "-Mipa=fast" "" CMAKE_Fortran_FLAGS_RELEASE ${CMAKE_Fortran_FLAGS_RELEASE} ) - + # In windows, dynamically link to the C runtime, and tell fortran linker to not include default main subroutine if( WIN32 ) set( CMAKE_EXE_LINKER_FLAGS "-Bdynamic -Mnostartup ${CMAKE_EXE_LINKER_FLAGS}" ) @@ -296,7 +296,7 @@ if( GTEST_FOUND ) ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include) add_definitions(-DCORR_TEST_WITH_ACML) - + add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) @@ -311,8 +311,8 @@ if( GTEST_FOUND ) set_target_properties(test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS) set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) - # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with - # gcc > 4.3.2 to support ACML. + # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with + # gcc > 4.3.2 to support ACML. 
# If the runtime is being built by the project, use it, otherwise link to a runtime library specified in the install prefix if( BUILD_RUNTIME ) target_link_libraries(test-correctness ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) @@ -350,7 +350,7 @@ if( GTEST_FOUND ) set_target_properties( test-medium PROPERTIES LINKER_LANGUAGE Fortran ) set_target_properties( test-short PROPERTIES LINKER_LANGUAGE Fortran ) endif( ) - + if( BUILD_RUNTIME ) if( NETLIB_FOUND ) target_link_libraries(test-correctness ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) @@ -373,7 +373,7 @@ if( GTEST_FOUND ) endif( ) endif( ) endif( ) - + set_property( TARGET test-correctness PROPERTY FOLDER "Test") set_property( TARGET test-medium PROPERTY FOLDER "Test") set_property( TARGET test-short PROPERTY FOLDER "Test") @@ -384,7 +384,7 @@ if( GTEST_FOUND ) LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) - + get_target_property( testLocation test-correctness LOCATION ) configure_file( @@ -395,7 +395,7 @@ if( GTEST_FOUND ) # Register script at run at install time to analyze the executable and copy dependencies into package install( SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake") - + if( ACML_FOUND ) include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include) @@ -430,7 +430,7 @@ if( GTEST_FOUND ) add_executable(test-functional ${SRC_FUNC} ${SRC_COMMON} ${SRC_COMMON_TIMER} ${FUNC_HEADERS} ${TESTS_HEADERS}) - + set_target_properties( test-functional PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) if( BUILD_RUNTIME ) target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} clBLAS ) diff --git a/src/tests/correctness/corr-trmv.cpp b/src/tests/correctness/corr-trmv.cpp index 7e97d6c3..a8c7151a 100644 --- a/src/tests/correctness/corr-trmv.cpp +++ 
b/src/tests/correctness/corr-trmv.cpp @@ -127,7 +127,7 @@ trmvCorrectnessTest(TestParams *params) // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_WRITE_ONLY); - bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_ONLY); + bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_WRITE); //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); diff --git a/src/tests/include/cmdline.h b/src/tests/include/cmdline.h index 68ddfba1..b7679732 100644 --- a/src/tests/include/cmdline.h +++ b/src/tests/include/cmdline.h @@ -44,7 +44,7 @@ typedef enum SetoptFlags { SET_INCY = (1 << 9), SET_NUM_COMMAND_QUEUES = (1 << 10), SET_DEVICE_ORD = (1 << 11), - SET_PLATFORM_ORD = (1 << 12), + SET_PLATFORM_ORD = (1 << 12) } SetoptFlags; typedef struct TestParams { diff --git a/src/tests/include/matrix.h b/src/tests/include/matrix.h index 65757add..8794f0b8 100644 --- a/src/tests/include/matrix.h +++ b/src/tests/include/matrix.h @@ -310,21 +310,21 @@ compareMatrices( const cl_double *absDelta = NULL) { size_t m = 0, n = 0; - T a, b; + T ref, clresult; cl_double delta; if( lda > 0 ) // General case { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); + if( module(ref-clresult) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(ref, clresult, delta); } } } @@ -336,14 +336,14 @@ compareMatrices( { for( m=n; m < M; m++) 
{ - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); + if( module(ref-clresult) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(ref, clresult, delta); } } } @@ -353,14 +353,14 @@ compareMatrices( { for( n = 0; n <= m; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); + if( module(ref-clresult) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(ref, clresult, delta); } } } @@ -379,23 +379,23 @@ compareMatrices( const cl_double *absDelta) { size_t m = 0, n = 0; - FloatComplex a, b; + FloatComplex ref, clresult; cl_double delta; if ( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + 
ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -407,16 +407,16 @@ if ( lda > 0 ) { for( m=n; m < M; m++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -426,16 +426,16 @@ if ( lda > 0 ) { for( n = 0; n <= m; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -455,22 +455,22 @@ compareMatrices( const cl_double *absDelta) { size_t m = 0, n = 0; - DoubleComplex a, b; + DoubleComplex ref, clresult; cl_double delta; if( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, 
clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -482,16 +482,16 @@ if( lda > 0 ) { for( m=n; m < M; m++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -501,16 +501,16 @@ if( lda > 0 ) { for( n = 0; n <= m; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), 
CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -764,10 +764,10 @@ template static void compareValues( const T *A, const T *B, const cl_double absDelta=0.0 ) { - T a, b; - a = *A; - b = *B; - ASSERT_NEAR(a, b, absDelta); + T ref, clresult; + ref = *A; + clresult = *B; + ASSERT_NEAR(ref, clresult, absDelta); } template<> @@ -775,12 +775,12 @@ __template_static void compareValues ( const FloatComplex *A, const FloatComplex *B, const cl_double absDelta ) { - FloatComplex a, b; + FloatComplex ref, clresult; - a = *A; - b = *B; - ASSERT_NEAR(CREAL(a), CREAL(b), absDelta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), absDelta); + ref = *A; + clresult = *B; + ASSERT_NEAR(CREAL(ref), CREAL(clresult), absDelta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), absDelta); } template<> @@ -788,11 +788,11 @@ __template_static void compareValues ( const DoubleComplex *A, const DoubleComplex *B, const cl_double absDelta ) { - DoubleComplex a, b; + DoubleComplex ref, clresult; - a = *A; - b = *B; - ASSERT_NEAR(CREAL(a), CREAL(b), absDelta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), absDelta); + ref = *A; + clresult = *B; + ASSERT_NEAR(CREAL(ref), CREAL(clresult), absDelta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), absDelta); } #endif // MATRIX_H_ From 96cae21788da111fef22603630b711538406709d Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Fri, 29 Apr 2016 13:09:59 -0500 Subject: [PATCH 34/45] Commenting out further #pragma warning messages Changing default for 'keeping kernel sources' to ON to fix unit test failures on CPU devices --- src/library/CMakeLists.txt | 32 +++++++++---------- .../dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp | 2 +- .../dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp | 2 +- 
.../dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp | 2 +- .../sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp | 2 +- .../sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp | 2 +- .../sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp | 2 +- ..._Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 2 +- .../sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp | 4 +-- .../sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp | 2 +- .../sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp | 2 +- .../sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp | 2 +- .../sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp | 2 +- .../sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp | 2 +- ..._Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp | 2 +- ..._Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp | 2 +- .../sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp | 2 +- ...emm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp | 2 +- ...emm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp | 2 +- .../sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp | 2 +- .../sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp | 2 +- .../sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp | 6 ++-- .../sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp | 2 +- .../sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp | 2 +- .../sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp | 2 +- ..._Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 2 +- .../sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp | 2 +- .../sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp | 2 +- .../sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp | 2 +- 32 files changed, 50 insertions(+), 50 deletions(-) diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt index 1925b6c2..d94b6656 100644 --- a/src/library/CMakeLists.txt +++ b/src/library/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -99,7 +99,7 @@ set_property( CACHE AUTOGEMM_ARCHITECTURE PROPERTY STRINGS "Hawaii" "Fiji" ) # opencl compiler version #set( PRECOMPILE_GEMM_OPENCL_VERSION "2.0" CACHE STRING "OpenCL compiler version supported by device driver." ) #set_property( CACHE PRECOMPILE_GEMM_OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 ) -#message( STATUS "AutoGemm PreCompiler will use OpenCL ${PRECOMPILE_GEMM_OPENCL_VERSION} compiler." ) +#message( STATUS "AutoGemm PreCompiler will use OpenCL ${PRECOMPILE_GEMM_OPENCL_VERSION} compiler." ) # PreCompile precision selected? set( PRECOMPILE_GEMM_PRECISION_SELECTED OFF) @@ -554,8 +554,8 @@ set(SRC_CL_TEMPLATES_GEN dtrsm_gpu192.clHawaii_64.bin.cl dgemm_hawaiiChannelConfilct.clHawaii_64.bin.cl dgemm_hawaiiSplitKernel.clHawaii_64.bin.cl - sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl - sgemm_hawaiiSplitKernel.clBonaire_64.bin.cl + sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl + sgemm_hawaiiSplitKernel.clBonaire_64.bin.cl dgemm_hawai.clTahiti_64.bin.cl dtrsm_gpu.clTahiti_64.bin.cl dgemm_gcn_SmallMatrices.clHawaii_64.bin.cl @@ -628,7 +628,7 @@ set(CLBLAS_SOURCES ) set(GLOBAL_HEADERS ${clBLAS_SOURCE_DIR}/clBLAS.h - ${clBLAS_SOURCE_DIR}/clBLAS-complex.h + ${clBLAS_SOURCE_DIR}/clBLAS-complex.h ${clBLAS_SOURCE_DIR}/include/clkern.h ${clBLAS_SOURCE_DIR}/include/cltypes.h ${clBLAS_SOURCE_DIR}/include/dblock_kgen.h @@ -673,7 +673,7 @@ if( BLAS_DUMP_CLBLAS_KERNELS ) add_definitions( -DDUMP_CLBLAS_KERNELS ) endif() -option( BLAS_KEEP_KERNEL_SOURCES "Prevent the library from stripping source from kernels" OFF ) +option( BLAS_KEEP_KERNEL_SOURCES "Prevent the library from stripping source from kernels" ON ) if( BLAS_KEEP_KERNEL_SOURCES ) add_definitions( 
-DKEEP_CLBLAS_KERNEL_SOURCES ) endif() @@ -723,7 +723,7 @@ add_custom_command(TARGET OCLBinaryGenerator_GEN PRE_BUILD COMMAND ${CMAKE_COMMAND} -DOCLBinaryGeneratorBinaryDir=${OCLBinaryGeneratorBinaryDir} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DOCL_COMPILER_FLAGS=${OCL_COMPILER_FLAGS} -P "${CMAKE_SOURCE_DIR}/library/OCLBinaryGenerator.cmake" - ) + ) add_dependencies( OCLBinaryGenerator_GEN OCLBinaryGenerator ) endif() @@ -759,7 +759,7 @@ message(STATUS "OPENCL_VERSION = ${OPENCL_VERSION}") # list(GET OPENCL_FLAGS_REPLACED 1 OPENCL_FLAGS_REPLACED_1)#flags for TAHITI # list(GET OPENCL_FLAGS_REPLACED 3 OPENCL_FLAGS_REPLACED_3)#flags for HAWAII 1 # list(GET OPENCL_FLAGS_REPLACED 5 OPENCL_FLAGS_REPLACED_5)#flags for HAWAII 2 -# list(GET OPENCL_FLAGS_REPLACED 7 OPENCL_FLAGS_REPLACED_7)#flags for BONAIRE +# list(GET OPENCL_FLAGS_REPLACED 7 OPENCL_FLAGS_REPLACED_7)#flags for BONAIRE # else() # MESSAGE(STATUS "flags.txt not found. will use the default flags.") # set (LOAD_CL_FLAGS FALSE) @@ -787,19 +787,19 @@ if (LOAD_CL_FLAGS) add_custom_target( GEN_CLBIN ) add_custom_command(TARGET GEN_CLBIN PRE_BUILD - COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" - -DLOAD_CL_FLAGS=${LOAD_CL_FLAGS} -DTAHITI_FLAG=${OPENCL_FLAGS_REPLACED_1} -DHAWAII1_FLAG=${OPENCL_FLAGS_REPLACED_3} -DHAWAII2_FLAG=${OPENCL_FLAGS_REPLACED_5} -DBONAIRE_FLAG=${OPENCL_FLAGS_REPLACED_7} - -DENV_PATH=${ENV_PATH} -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} + COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" + -DLOAD_CL_FLAGS=${LOAD_CL_FLAGS} -DTAHITI_FLAG=${OPENCL_FLAGS_REPLACED_1} -DHAWAII1_FLAG=${OPENCL_FLAGS_REPLACED_3} -DHAWAII2_FLAG=${OPENCL_FLAGS_REPLACED_5} -DBONAIRE_FLAG=${OPENCL_FLAGS_REPLACED_7} + 
-DENV_PATH=${ENV_PATH} -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} -DOPENCL_OFFLINE_BUILD_TAHITI_KERNEL=${OPENCL_OFFLINE_BUILD_TAHITI_KERNEL} -P "${CMAKE_SOURCE_DIR}/library/bingen.cmake" - ) + ) add_dependencies( GEN_CLBIN bingen ) else() add_custom_target( GEN_CLBIN ) add_custom_command(TARGET GEN_CLBIN PRE_BUILD - COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" - -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} + COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" + -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} -DOPENCL_OFFLINE_BUILD_TAHITI_KERNEL=${OPENCL_OFFLINE_BUILD_TAHITI_KERNEL} -P "${CMAKE_SOURCE_DIR}/library/bingen.cmake" ) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp index 04c07e8a..4eb654d2 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NN_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NN_B0_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NN_B0_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NN_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp 
b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp index 05417daa..0fbde4a5 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NN_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NN_B1_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NN_B1_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NN_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp index ffe879af..d35d8140 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NT_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NT_B0_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NT_B0_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NT_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp index 5af48fcb..e13eda7f 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NT_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NT_B1_MX048_NX048_KX08_SRC_H -#pragma 
message("AutoGemm's dgemm_Col_NT_B1_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NT_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp index 1bed066f..e9710aec 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_TN_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_TN_B0_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_TN_B0_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_TN_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp index 927952b2..43429334 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_TN_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_TN_B1_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_TN_B1_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_TN_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp index b8ba4e85..033f9269 100644 --- 
a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B0_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp index be06d446..99813f33 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B0_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp index c1f92569..ef8a648b 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B0_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's 
sgemm_Col_NN_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp index cc90ff90..c666ed5c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_BRANCH_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp index b2f8306f..9c0eb191 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) @@ -79,7 +79,7 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16 ( __local float* plA = lA + idyT*33+idxT; __local float* plB = lB + idxT*33+idyT; - + barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; diff --git 
a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp index 80aeceaa..44b5acfb 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp index 26e354b5..2bdefd7c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp index d513c81b..fec4f08c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef 
KERNEL_SGEMM_COL_NT_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B0_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp index 11558a03..e0437cf7 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B0_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp index a8d0fec1..7f66ea07 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B0_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp 
b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp index f26ddece..1e8e76a9 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_BRANCH_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp index a01958f1..7ca44c5c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp @@ -6,7 +6,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SINGLE_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SINGLE_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp index 1a2ca972..1c2974a8 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp +++ 
b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp index ae477cbe..ebc42b61 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp @@ -6,7 +6,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX064_KX16_ROW_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX064_KX16_ROW_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp index 3d39977e..b0213c93 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp @@ -6,7 +6,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX064_NX032_KX16_COLUMN_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX064_NX032_KX16_COLUMN_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src (if exists) overriden by user.") +// #pragma message("AutoGemm's 
sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp index d84d4ecd..e22d616b 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp index 7e4401db..9a014e05 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX096_NX096_KX16_src overriden by user.") #include "UserGemmKernelSourceIncludes.h" diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp index 4c5ceb4d..ecc4d5f6 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp +++ 
b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX128_NX128_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX128_NX128_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX128_NX128_KX16_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX128_NX128_KX16_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" @@ -53,7 +53,7 @@ const char * const sgemm_Col_NT_B1_MX128_NX128_KX16_src = STRINGIFY( rC[4][0] = mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0] = mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[6][0] = mad(rA[0][6],rB[0][0],rC[6][0]); \ - rC[7][0] = mad(rA[0][7],rB[0][0],rC[7][0]); \ + rC[7][0] = mad(rA[0][7],rB[0][0],rC[7][0]); \ rC[0][1] = mad(rA[0][0], rB[0][1], rC[0][1]); \ rC[1][1] = mad(rA[0][1], rB[0][1], rC[1][1]); \ rC[2][1] = mad(rA[0][2], rB[0][1], rC[2][1]); \ @@ -284,7 +284,7 @@ uint offsetC) C[80 * ldc] = alpha*rC[7][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[7][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[7][7] + beta*C[112 * ldc]; - + } ); #endif diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp index 5722f9ee..cf1f406b 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B0_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp 
b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp index fd80cb99..2dfd586f 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B0_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp index 48323fc3..ccf23bd7 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B0_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp index a41a09ef..00e3e661 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_BRANCH_SRC_H #define 
KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_BRANCH_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp index 1b435748..5f2ed47c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp index a678e204..ae198149 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp 
b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp index f15882fa..7985e474 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) From d20977ec4389c6b3751e318779410007c5e272f8 Mon Sep 17 00:00:00 2001 From: tfauck Date: Thu, 5 May 2016 15:28:30 +0200 Subject: [PATCH 35/45] Support for altivec on powerpc64 P8 systems (#262) Support for altivec on powerpc64 P8 systems :shipit: Thank you for your contribution --- src/include/defbool.h | 5 +++ src/library/blas/functor/functor_xscal.cc | 2 +- src/library/blas/generic/solution_seq_make.c | 4 +- src/library/blas/gens/asum.cpp | 6 +-- src/library/blas/gens/axpy_reg.cpp | 8 ++-- src/library/blas/gens/copy_reg.cpp | 8 ++-- src/library/blas/gens/dot.cpp | 8 ++-- src/library/blas/gens/gbmv.cpp | 4 +- src/library/blas/gens/gemv.c | 10 ++--- src/library/blas/gens/ger_lds.cpp | 4 +- src/library/blas/gens/her2_lds.cpp | 4 +- src/library/blas/gens/her_lds.cpp | 2 +- src/library/blas/gens/iamax.cpp | 6 +-- src/library/blas/gens/nrm2.cpp | 6 +-- src/library/blas/gens/rotm_reg.cpp | 8 ++-- src/library/blas/gens/scal_reg.cpp | 4 +- src/library/blas/gens/swap_reg.cpp | 8 ++-- src/library/blas/gens/symv.c | 10 ++--- src/library/blas/gens/syr2_lds.cpp | 4 +- src/library/blas/gens/syr_lds.cpp | 2 +- src/library/blas/gens/trmv_reg.cpp | 4 +- src/library/blas/gens/trsv_gemv.cpp | 2 +- src/library/blas/gens/trsv_trtri.cpp | 2 +- src/library/blas/include/clblas-internal.h | 2 +- src/library/blas/ixamax.c | 2 +- 
src/library/blas/xasum.c | 2 +- src/library/blas/xaxpy.c | 4 +- src/library/blas/xcopy.c | 4 +- src/library/blas/xdot.c | 4 +- src/library/blas/xgbmv.c | 4 +- src/library/blas/xgemv.c | 4 +- src/library/blas/xger.c | 4 +- src/library/blas/xhemv.c | 4 +- src/library/blas/xher.c | 2 +- src/library/blas/xher2.c | 8 ++-- src/library/blas/xhpmv.c | 4 +- src/library/blas/xnrm2.c | 2 +- src/library/blas/xrot.c | 4 +- src/library/blas/xrotm.c | 4 +- src/library/blas/xscal.c | 2 +- src/library/blas/xshbmv.c | 4 +- src/library/blas/xspmv.c | 4 +- src/library/blas/xswap.c | 4 +- src/library/blas/xsymv.c | 4 +- src/library/blas/xsyr.c | 2 +- src/library/blas/xsyr2.c | 4 +- src/library/blas/xtbmv.c | 4 +- src/library/blas/xtbsv.c | 44 ++++++++++---------- src/library/blas/xtrmv.c | 4 +- src/library/blas/xtrsv.c | 2 +- src/library/tools/ktest/config.cpp | 4 +- src/library/tools/ktest/steps/gemv.cpp | 12 +++--- src/library/tools/ktest/steps/symv.cpp | 12 +++--- 53 files changed, 145 insertions(+), 140 deletions(-) diff --git a/src/include/defbool.h b/src/include/defbool.h index 26caf6af..741e6c59 100644 --- a/src/include/defbool.h +++ b/src/include/defbool.h @@ -18,6 +18,11 @@ #ifndef DEFBOOL_H_ #define DEFBOOL_H_ +#if defined(__powerpc64__) && defined(__ALTIVEC__) +#include "altivec.h" +#undef bool +#endif + #if defined(_MSC_VER) && _MSC_VER <= 1700 /* diff --git a/src/library/blas/functor/functor_xscal.cc b/src/library/blas/functor/functor_xscal.cc index 1615499c..c19d2688 100644 --- a/src/library/blas/functor/functor_xscal.cc +++ b/src/library/blas/functor/functor_xscal.cc @@ -81,7 +81,7 @@ doScal( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx if(incx < 0) { // According to Netlib - return for negative incx return clblasSuccess; diff --git a/src/library/blas/generic/solution_seq_make.c b/src/library/blas/generic/solution_seq_make.c index b92e18ec..ab64869a 
100644 --- a/src/library/blas/generic/solution_seq_make.c +++ b/src/library/blas/generic/solution_seq_make.c @@ -709,10 +709,10 @@ clblasArgsToKextraFlags(const CLBlasKargs *args, BlasFunctionID funcID) } } if (funcID == CLBLAS_GEMV || funcID == CLBLAS_SYMV) { - if (args->ldb.vector == 1) { + if (args->ldb.Vector == 1) { flags |= KEXTRA_INCX_ONE; } - if (args->ldc.vector == 1) { + if (args->ldc.Vector == 1) { flags |= KEXTRA_INCY_ONE; } } diff --git a/src/library/blas/gens/asum.cpp b/src/library/blas/gens/asum.cpp index 06b9f544..8c377cc1 100644 --- a/src/library/blas/gens/asum.cpp +++ b/src/library/blas/gens/asum.cpp @@ -137,10 +137,10 @@ setBuildOpts( printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldb.vector) < 1) { + if( (kargs->ldb.Vector) < 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NEGATIVE"); } return; @@ -275,7 +275,7 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->D); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); return; } diff --git a/src/library/blas/gens/axpy_reg.cpp b/src/library/blas/gens/axpy_reg.cpp index 52aab71f..fc73cbda 100644 --- a/src/library/blas/gens/axpy_reg.cpp +++ b/src/library/blas/gens/axpy_reg.cpp @@ -130,10 +130,10 @@ setBuildOpts( printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldc.vector) != 1) { + if( (kargs->ldc.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } @@ -269,10 +269,10 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[2], blasArgs->B); initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[5], incx); initSizeKarg(&args[6], blasArgs->offCY); - incy = blasArgs->ldc.vector; + incy = blasArgs->ldc.Vector; INIT_KARG(&args[7], incy); return; diff --git a/src/library/blas/gens/copy_reg.cpp b/src/library/blas/gens/copy_reg.cpp index ba1ff398..28df2ee1 100644 --- a/src/library/blas/gens/copy_reg.cpp +++ b/src/library/blas/gens/copy_reg.cpp @@ -130,10 +130,10 @@ setBuildOpts( printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldc.vector) != 1) { + if( (kargs->ldc.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } @@ -264,10 +264,10 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->B); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offCY); - incy = blasArgs->ldc.vector; + incy = blasArgs->ldc.Vector; INIT_KARG(&args[6], incy); return; diff --git a/src/library/blas/gens/dot.cpp b/src/library/blas/gens/dot.cpp index ed3e72b8..ce766cbd 100644 --- a/src/library/blas/gens/dot.cpp +++ b/src/library/blas/gens/dot.cpp @@ -133,10 +133,10 @@ setBuildOpts( printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldc.vector) != 1) { + if( (kargs->ldc.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } @@ -272,10 +272,10 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[2], blasArgs->D); initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[5], incx); initSizeKarg(&args[6], blasArgs->offCY); - incy = blasArgs->ldc.vector; + incy = blasArgs->ldc.Vector; INIT_KARG(&args[7], incy); doConj = blasArgs->K; INIT_KARG(&args[8], doConj); diff --git a/src/library/blas/gens/gbmv.cpp b/src/library/blas/gens/gbmv.cpp index ab8e5e2a..ad63a621 100644 --- a/src/library/blas/gens/gbmv.cpp +++ b/src/library/blas/gens/gbmv.cpp @@ -389,9 +389,9 @@ 
assignKargs(KernelArg *args, const void *params, const void* ) initSizeKarg(&args[6], fKU); initSizeKarg(&args[7], blasArgs->lda.matrix); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[8], inc); - inc = blasArgs->ldc.vector; + inc = blasArgs->ldc.Vector; INIT_KARG(&args[9], inc); initSizeKarg(&args[10], blasArgs->offa); diff --git a/src/library/blas/gens/gemv.c b/src/library/blas/gens/gemv.c index 40293d8b..9835482f 100644 --- a/src/library/blas/gens/gemv.c +++ b/src/library/blas/gens/gemv.c @@ -434,12 +434,12 @@ assignKargs(KernelArg *args, const void *params, const void *extra) initSizeKarg(&args[i++], blasArgs->offCY); } if (!(kflags & KEXTRA_INCX_ONE)) { - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[i], inc); i++; } if (!(kflags & KEXTRA_INCY_ONE)) { - inc = blasArgs->ldc.vector; + inc = blasArgs->ldc.Vector; INIT_KARG(&args[i], inc); i++; } @@ -479,12 +479,12 @@ fixupArgs(void *args, SubproblemDim *subdims, void *extra) else { kargs->offA += off * kargs->lda.matrix; } - if (kargs->ldc.vector < 0) { + if (kargs->ldc.Vector < 0) { // K store the original height of the matrix A - kargs->offCY += (kargs->K - off) * abs(kargs->ldc.vector); + kargs->offCY += (kargs->K - off) * abs(kargs->ldc.Vector); } else { - kargs->offCY += off * kargs->ldc.vector; + kargs->offCY += off * kargs->ldc.Vector; } } diff --git a/src/library/blas/gens/ger_lds.cpp b/src/library/blas/gens/ger_lds.cpp index f72d1975..267c979e 100644 --- a/src/library/blas/gens/ger_lds.cpp +++ b/src/library/blas/gens/ger_lds.cpp @@ -317,8 +317,8 @@ assignKargs(KernelArg *args, const void *params, const void*) initSizeKarg(&args[3], blasArgs->M); initSizeKarg(&args[4], blasArgs->N); - incx = blasArgs->ldb.vector; - incy = blasArgs->ldc.vector; + incx = blasArgs->ldb.Vector; + incy = blasArgs->ldc.Vector; initSizeKarg(&args[5], blasArgs->offBX); INIT_KARG(&args[6], incx); initSizeKarg(&args[7], blasArgs->offCY); diff --git 
a/src/library/blas/gens/her2_lds.cpp b/src/library/blas/gens/her2_lds.cpp index 5adda19d..3fd2fd0d 100644 --- a/src/library/blas/gens/her2_lds.cpp +++ b/src/library/blas/gens/her2_lds.cpp @@ -332,10 +332,10 @@ assignKargs(KernelArg *args, const void *params, const void*) INIT_KARG(&args[2], blasArgs->C); //Y - y vector initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[5], inc); initSizeKarg(&args[6], blasArgs->offCY); - inc = blasArgs->ldc.vector; + inc = blasArgs->ldc.Vector; INIT_KARG(&args[7], inc); initSizeKarg(&args[8], blasArgs->offa); initSizeKarg(&args[9], blasArgs->lda.matrix); diff --git a/src/library/blas/gens/her_lds.cpp b/src/library/blas/gens/her_lds.cpp index 1a8365f0..8748645b 100644 --- a/src/library/blas/gens/her_lds.cpp +++ b/src/library/blas/gens/her_lds.cpp @@ -330,7 +330,7 @@ assignKargs(KernelArg *args, const void *params, const void*) INIT_KARG(&args[1], blasArgs->B); //x - x vector initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offa); initSizeKarg(&args[6], blasArgs->lda.matrix); diff --git a/src/library/blas/gens/iamax.cpp b/src/library/blas/gens/iamax.cpp index 7a5966de..13928a8c 100644 --- a/src/library/blas/gens/iamax.cpp +++ b/src/library/blas/gens/iamax.cpp @@ -130,12 +130,12 @@ setBuildOpts( #endif } - if( (kargs->ldb.vector) != 1) + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldb.vector) < 1) + if( (kargs->ldb.Vector) < 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID"); } @@ -277,7 +277,7 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->D); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offb); - incx = 
blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); return; diff --git a/src/library/blas/gens/nrm2.cpp b/src/library/blas/gens/nrm2.cpp index d898ffbc..c2dfe91a 100644 --- a/src/library/blas/gens/nrm2.cpp +++ b/src/library/blas/gens/nrm2.cpp @@ -139,10 +139,10 @@ setBuildOpts( addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_SSQ"); } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldb.vector) < 1) { + if( (kargs->ldb.Vector) < 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID"); } return; @@ -269,7 +269,7 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->D); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); return; diff --git a/src/library/blas/gens/rotm_reg.cpp b/src/library/blas/gens/rotm_reg.cpp index 2b87507e..5bfdc0fe 100644 --- a/src/library/blas/gens/rotm_reg.cpp +++ b/src/library/blas/gens/rotm_reg.cpp @@ -127,10 +127,10 @@ setBuildOpts( { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_ROT"); } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldc.vector) != 1) { + if( (kargs->ldc.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } @@ -268,10 +268,10 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->B); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offCY); - incy = blasArgs->ldc.vector; + incy = blasArgs->ldc.Vector; INIT_KARG(&args[6], incy); if(blasArgs->pigFuncID == CLBLAS_ROT) diff --git 
a/src/library/blas/gens/scal_reg.cpp b/src/library/blas/gens/scal_reg.cpp index 8b853106..732ecb08 100644 --- a/src/library/blas/gens/scal_reg.cpp +++ b/src/library/blas/gens/scal_reg.cpp @@ -130,7 +130,7 @@ setBuildOpts( printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } @@ -261,7 +261,7 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->A); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); return; diff --git a/src/library/blas/gens/swap_reg.cpp b/src/library/blas/gens/swap_reg.cpp index b75e1004..a93cbd0e 100644 --- a/src/library/blas/gens/swap_reg.cpp +++ b/src/library/blas/gens/swap_reg.cpp @@ -130,10 +130,10 @@ setBuildOpts( printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } - if( (kargs->ldb.vector) != 1) { + if( (kargs->ldb.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } - if( (kargs->ldc.vector) != 1) { + if( (kargs->ldc.Vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } @@ -265,10 +265,10 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[1], blasArgs->B); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - incx = blasArgs->ldb.vector; + incx = blasArgs->ldb.Vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offCY); - incy = blasArgs->ldc.vector; + incy = blasArgs->ldc.Vector; INIT_KARG(&args[6], incy); return; diff --git a/src/library/blas/gens/symv.c b/src/library/blas/gens/symv.c index 49448927..47c8f1d2 100644 --- a/src/library/blas/gens/symv.c +++ b/src/library/blas/gens/symv.c @@ -926,12 +926,12 @@ assignKargs(KernelArg *args, const void *params, const void *extra) initSizeKarg(&args[i++], blasArgs->offCY); } if (!(kflags & KEXTRA_INCX_ONE)) { - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[i], inc); i++; } if (!(kflags & KEXTRA_INCY_ONE)) { - inc = blasArgs->ldc.vector; + inc = blasArgs->ldc.Vector; INIT_KARG(&args[i], inc); i++; } @@ -949,13 +949,13 @@ fixupArgs(void *args, SubproblemDim *subdims, void *extra) (void)subdims; if (kargs->offsetN) { - if (kargs->ldc.vector < 0) { + if (kargs->ldc.Vector < 0) { // K store the original height of the matrix A kargs->offCY += (kargs->K - kargs->offsetN) * - abs(kargs->ldc.vector); + abs(kargs->ldc.Vector); } else { - kargs->offCY += kargs->offsetN * kargs->ldc.vector; + kargs->offCY += kargs->offsetN * kargs->ldc.Vector; } } } diff --git a/src/library/blas/gens/syr2_lds.cpp b/src/library/blas/gens/syr2_lds.cpp index f5c20cb1..6e7df1dd 100644 --- a/src/library/blas/gens/syr2_lds.cpp +++ b/src/library/blas/gens/syr2_lds.cpp @@ -338,10 +338,10 @@ assignKargs(KernelArg 
*args, const void *params, const void*) INIT_KARG(&args[2], blasArgs->C); //Y - y vector initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[5], inc); initSizeKarg(&args[6], blasArgs->offCY); - inc = blasArgs->ldc.vector; + inc = blasArgs->ldc.Vector; INIT_KARG(&args[7], inc); initSizeKarg(&args[8], blasArgs->offa); initSizeKarg(&args[9], blasArgs->lda.matrix); diff --git a/src/library/blas/gens/syr_lds.cpp b/src/library/blas/gens/syr_lds.cpp index 16911bb4..15e56ef1 100644 --- a/src/library/blas/gens/syr_lds.cpp +++ b/src/library/blas/gens/syr_lds.cpp @@ -337,7 +337,7 @@ assignKargs(KernelArg *args, const void *params, const void*) INIT_KARG(&args[1], blasArgs->B); //x - x vector initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[4], inc); initSizeKarg(&args[5], blasArgs->offA); initSizeKarg(&args[6], blasArgs->lda.matrix); diff --git a/src/library/blas/gens/trmv_reg.cpp b/src/library/blas/gens/trmv_reg.cpp index 9cacd0f1..0adcd22c 100644 --- a/src/library/blas/gens/trmv_reg.cpp +++ b/src/library/blas/gens/trmv_reg.cpp @@ -426,7 +426,7 @@ assignKargs(KernelArg *args, const void *params, const void* ) INIT_KARG(&args[2], blasArgs->C); //y - scratch == _x_vector argument } initSizeKarg(&args[3], blasArgs->N); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[4], inc); unity = (blasArgs->diag == clblasUnit); INIT_KARG(&args[5], unity); @@ -444,7 +444,7 @@ assignKargs(KernelArg *args, const void *params, const void* ) // For HEMV both alpha and beta has to be passed. 
if( (step->funcID == CLBLAS_HEMV) || (blasArgs->pigFuncID == CLBLAS_HPMV) || (blasArgs->pigFuncID == CLBLAS_SPMV) ) { - inc = blasArgs->ldc.vector; + inc = blasArgs->ldc.Vector; INIT_KARG(&args[10], inc); initSizeKarg(&args[11], blasArgs->offCY); assignScalarKarg(&args[12], &(blasArgs->alpha), blasArgs->dtype); diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp index ca73fbe5..65121a1c 100644 --- a/src/library/blas/gens/trsv_gemv.cpp +++ b/src/library/blas/gens/trsv_gemv.cpp @@ -474,7 +474,7 @@ assignKargs(KernelArg *args, const void *params, const void*) INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument initSizeKarg(&args[2], blasArgs->N); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[3], inc); unity = (blasArgs->diag == clblasUnit); INIT_KARG(&args[4], unity); diff --git a/src/library/blas/gens/trsv_trtri.cpp b/src/library/blas/gens/trsv_trtri.cpp index 0bae0f99..db4c9a0a 100644 --- a/src/library/blas/gens/trsv_trtri.cpp +++ b/src/library/blas/gens/trsv_trtri.cpp @@ -382,7 +382,7 @@ assignKargs(KernelArg *args, const void *params, const void*) INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument initSizeKarg(&args[2], blasArgs->N); - inc = blasArgs->ldb.vector; + inc = blasArgs->ldb.Vector; INIT_KARG(&args[3], inc); unity = (blasArgs->diag == clblasUnit); INIT_KARG(&args[4], unity); diff --git a/src/library/blas/include/clblas-internal.h b/src/library/blas/include/clblas-internal.h index e9a2d429..e5508e4b 100644 --- a/src/library/blas/include/clblas-internal.h +++ b/src/library/blas/include/clblas-internal.h @@ -61,7 +61,7 @@ typedef union ArgMultiplier { typedef union LeadingDimention { size_t matrix; /**< Positive ld value for matrixes */ - int vector; /**< Integer offset value for vectors */ + int Vector; /**< Integer offset value 
for vectors */ } LeadingDimention; typedef enum reductionType { diff --git a/src/library/blas/ixamax.c b/src/library/blas/ixamax.c index 18088b61..43099f6b 100644 --- a/src/library/blas/ixamax.c +++ b/src/library/blas/ixamax.c @@ -108,7 +108,7 @@ doiAmax( kargs->N = N; kargs->B = X; kargs->offb = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx if(incx < 1) { // According to netlib, if incx<1, NRM2 will be zero kargs->N = 1; // Makeing it launch only 1 work-group } diff --git a/src/library/blas/xasum.c b/src/library/blas/xasum.c index edc718c2..eafd3d85 100644 --- a/src/library/blas/xasum.c +++ b/src/library/blas/xasum.c @@ -111,7 +111,7 @@ doAsum( kargs->offA = offAsum; kargs->B = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx if(incx <1){ kargs->N = 1; } diff --git a/src/library/blas/xaxpy.c b/src/library/blas/xaxpy.c index 4eb66a39..c5e1e56a 100644 --- a/src/library/blas/xaxpy.c +++ b/src/library/blas/xaxpy.c @@ -103,10 +103,10 @@ doAxpy( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be using this as incy #ifdef DEBUG_AXPY printf("Calling makeSolutionSeq from DoAxpy: AXPY\n"); diff --git a/src/library/blas/xcopy.c b/src/library/blas/xcopy.c index d60ba5ee..5540375e 100644 --- a/src/library/blas/xcopy.c +++ b/src/library/blas/xcopy.c @@ -96,10 +96,10 @@ doCopy( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be 
using this as incy #ifdef DEBUG_COPY printf("Calling makeSolutionSeq from DoCopy: COPY\n"); diff --git a/src/library/blas/xdot.c b/src/library/blas/xdot.c index ed5530ea..e37d0e00 100644 --- a/src/library/blas/xdot.c +++ b/src/library/blas/xdot.c @@ -119,10 +119,10 @@ doDot( kargs->offa = offDP; kargs->B = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx kargs->C = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be using this as incy kargs->D = scratchBuff; kargs->redctnType = REDUCE_BY_SUM; kargs->K = (size_t)doConj; diff --git a/src/library/blas/xgbmv.c b/src/library/blas/xgbmv.c index 205f8ebc..dd5dd545 100644 --- a/src/library/blas/xgbmv.c +++ b/src/library/blas/xgbmv.c @@ -102,10 +102,10 @@ doGbmv( kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues, diff --git a/src/library/blas/xgemv.c b/src/library/blas/xgemv.c index 5999979a..e012918e 100644 --- a/src/library/blas/xgemv.c +++ b/src/library/blas/xgemv.c @@ -85,10 +85,10 @@ doGemv( kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GEMV, kargs, numCommandQueues, commandQueues, diff --git a/src/library/blas/xger.c b/src/library/blas/xger.c index 9f794208..4644bfaa 100644 --- a/src/library/blas/xger.c +++ b/src/library/blas/xger.c @@ -121,10 +121,10 @@ doGer( kargs->lda.matrix = lda; kargs->B = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // 
Will be using this as incx kargs->C = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be using this as incy kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; diff --git a/src/library/blas/xhemv.c b/src/library/blas/xhemv.c index 21011dd7..655b1974 100644 --- a/src/library/blas/xhemv.c +++ b/src/library/blas/xhemv.c @@ -86,10 +86,10 @@ doHemv( kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; diff --git a/src/library/blas/xher.c b/src/library/blas/xher.c index 0b945808..ecf8f8ac 100644 --- a/src/library/blas/xher.c +++ b/src/library/blas/xher.c @@ -100,7 +100,7 @@ doher( kargs->A = A; kargs->lda.matrix = lda; kargs->B = X; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; diff --git a/src/library/blas/xher2.c b/src/library/blas/xher2.c index a6de9fe7..05027adc 100644 --- a/src/library/blas/xher2.c +++ b/src/library/blas/xher2.c @@ -101,20 +101,20 @@ doHer2( { kargs->uplo = (uplo == clblasUpper) ? 
clblasLower : clblasUpper; kargs->B = Y; - kargs->ldb.vector = incy; + kargs->ldb.Vector = incy; kargs->offBX = offy; kargs->C = X; - kargs->ldc.vector = incx; + kargs->ldc.Vector = incx; kargs->offCY = offx; } else { kargs->uplo = uplo; kargs->B = X; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->offBX = offx; kargs->C = Y; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; kargs->offCY = offy; } kargs->N = N; diff --git a/src/library/blas/xhpmv.c b/src/library/blas/xhpmv.c index 1f0fe67b..1def97ce 100644 --- a/src/library/blas/xhpmv.c +++ b/src/library/blas/xhpmv.c @@ -85,10 +85,10 @@ doHpmv( kargs->lda.matrix = 0; // Set lda as zero for packed matrices kargs->B = X; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; diff --git a/src/library/blas/xnrm2.c b/src/library/blas/xnrm2.c index 833d855b..fc3dd514 100644 --- a/src/library/blas/xnrm2.c +++ b/src/library/blas/xnrm2.c @@ -219,7 +219,7 @@ doNrm2( kargs->offa = offNRM2; kargs->B = X; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; if(incx < 1) { // According to netlib, if incx<1, NRM2 will be zero kargs->N = 1; // Makeing it launch only 1 work-group } diff --git a/src/library/blas/xrot.c b/src/library/blas/xrot.c index cf9acc60..0ca5498a 100644 --- a/src/library/blas/xrot.c +++ b/src/library/blas/xrot.c @@ -95,10 +95,10 @@ doRot( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be using this as incy kargs->pigFuncID = CLBLAS_ROT; // Using ROTM kernel for ROT. 
Both are similar listInitHead(&seq); diff --git a/src/library/blas/xrotm.c b/src/library/blas/xrotm.c index dad78d8b..67adb04e 100644 --- a/src/library/blas/xrotm.c +++ b/src/library/blas/xrotm.c @@ -103,10 +103,10 @@ doRotm( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be using this as incy kargs->D = param; kargs->offd = offParam; kargs->pigFuncID = CLBLAS_ROTM; diff --git a/src/library/blas/xscal.c b/src/library/blas/xscal.c index 08bcb611..17560166 100644 --- a/src/library/blas/xscal.c +++ b/src/library/blas/xscal.c @@ -87,7 +87,7 @@ doScal( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx if(incx < 0) { // According to Netlib - return for negative incx return clblasSuccess; diff --git a/src/library/blas/xshbmv.c b/src/library/blas/xshbmv.c index 94f733da..7657b4e7 100644 --- a/src/library/blas/xshbmv.c +++ b/src/library/blas/xshbmv.c @@ -100,10 +100,10 @@ doSHbmv( kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues, diff --git a/src/library/blas/xspmv.c b/src/library/blas/xspmv.c index b40e0269..2ce10158 100644 --- a/src/library/blas/xspmv.c +++ b/src/library/blas/xspmv.c @@ -85,10 +85,10 @@ doSpmv( kargs->lda.matrix = 0; // Set lda as zero for packed matrices kargs->B = X; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; kargs->transA = 
clblasNoTrans; kargs->diag = clblasNonUnit; diff --git a/src/library/blas/xswap.c b/src/library/blas/xswap.c index 006053b7..92163125 100644 --- a/src/library/blas/xswap.c +++ b/src/library/blas/xswap.c @@ -96,10 +96,10 @@ doSwap( kargs->N = N; kargs->A = X; kargs->offBX = offx; - kargs->ldb.vector = incx; // Will be using this as incx + kargs->ldb.Vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; - kargs->ldc.vector = incy; // Will be using this as incy + kargs->ldc.Vector = incy; // Will be using this as incy #ifdef DEBUG_SWAP printf("Calling makeSolutionSeq from DoSwap: SWAP\n"); diff --git a/src/library/blas/xsymv.c b/src/library/blas/xsymv.c index 790e8720..e65a1e54 100644 --- a/src/library/blas/xsymv.c +++ b/src/library/blas/xsymv.c @@ -84,10 +84,10 @@ doSymv( kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = y; kargs->offCY = offy; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; #ifndef USE_SYMV diff --git a/src/library/blas/xsyr.c b/src/library/blas/xsyr.c index 99b47b4f..fc3d9e57 100644 --- a/src/library/blas/xsyr.c +++ b/src/library/blas/xsyr.c @@ -104,7 +104,7 @@ doSyr( kargs->A = A; kargs->lda.matrix = lda; kargs->B = X; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; diff --git a/src/library/blas/xsyr2.c b/src/library/blas/xsyr2.c index 37a31eed..0ca71e87 100644 --- a/src/library/blas/xsyr2.c +++ b/src/library/blas/xsyr2.c @@ -109,10 +109,10 @@ doSyr2( kargs->A = A; kargs->lda.matrix = lda; kargs->B = X; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->offBX = offx; kargs->C = Y; - kargs->ldc.vector = incy; + kargs->ldc.Vector = incy; kargs->offCY = offy; kargs->offa = offa; kargs->offA = offa; diff --git a/src/library/blas/xtbmv.c b/src/library/blas/xtbmv.c index 83322e55..a927b81e 100644 --- a/src/library/blas/xtbmv.c +++ b/src/library/blas/xtbmv.c @@ 
-143,9 +143,9 @@ doTbmv( kargs->A = A; kargs->lda.matrix = lda; kargs->B = y; // Now it becomes x = A * y - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = x; - kargs->ldc.vector = incx; + kargs->ldc.Vector = incx; kargs->offBX = 0; // Not used by assignKargs(); Just for clarity kargs->offCY = offx; kargs->offa = offa; diff --git a/src/library/blas/xtbsv.c b/src/library/blas/xtbsv.c index a24d74c5..44f00bdc 100644 --- a/src/library/blas/xtbsv.c +++ b/src/library/blas/xtbsv.c @@ -153,15 +153,15 @@ if (err == CL_SUCCESS) gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; - if(kargs->ldb.vector < 0) + if(kargs->ldb.Vector < 0) { - gbmv->args.offBX = kargs->offBX + ((i-1) * TARGET_ROWS) * abs(kargs->ldb.vector); - gbmv->args.offCY = kargs->offBX + ((i * TARGET_ROWS) ) * abs(kargs->ldb.vector); + gbmv->args.offBX = kargs->offBX + ((i-1) * TARGET_ROWS) * abs(kargs->ldb.Vector); + gbmv->args.offCY = kargs->offBX + ((i * TARGET_ROWS) ) * abs(kargs->ldb.Vector); } else { - gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.vector; - gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; + gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.Vector; + gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.Vector; } } else { @@ -176,15 +176,15 @@ if (err == CL_SUCCESS) gbmv->args.offA -= gbmv->args.KL; gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; - if(kargs->ldb.vector < 0) + if(kargs->ldb.Vector < 0) { - gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.startRow) * abs(kargs->ldb.vector); - gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow + gbmv->args.M) ) * abs(kargs->ldb.vector); + gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.startRow) * abs(kargs->ldb.Vector); + gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow + gbmv->args.M) ) * abs(kargs->ldb.Vector); } else { - gbmv->args.offBX = 
kargs->offBX + (trtri->args.startRow) * kargs->ldb.vector; - gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; + gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.Vector; + gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.Vector; } } @@ -388,15 +388,15 @@ offa = r * lda + col - (r - k); gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; - if(kargs->ldb.vector < 0) + if(kargs->ldb.Vector < 0) { - gbmv->args.offBX = kargs->offBX + (kargs->N - (gbmv->args.endRow)) * abs(kargs->ldb.vector); - gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.endRow + gbmv->args.N) ) * abs(kargs->ldb.vector); + gbmv->args.offBX = kargs->offBX + (kargs->N - (gbmv->args.endRow)) * abs(kargs->ldb.Vector); + gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.endRow + gbmv->args.N) ) * abs(kargs->ldb.Vector); } else { - gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; - gbmv->args.offCY = kargs->offBX + (gbmv->args.endRow) * kargs->ldb.vector; + gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.Vector; + gbmv->args.offCY = kargs->offBX + (gbmv->args.endRow) * kargs->ldb.Vector; } @@ -416,15 +416,15 @@ offa = r * lda + col - (r - k); gbmv->args.offA -= gbmv->args.KL; gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; - if(kargs->ldb.vector < 0) + if(kargs->ldb.Vector < 0) { - gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.endRow) * abs(kargs->ldb.vector); - gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow) ) * abs(kargs->ldb.vector); + gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.endRow) * abs(kargs->ldb.Vector); + gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow) ) * abs(kargs->ldb.Vector); } else { - gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; - gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow - gbmv->args.N) * kargs->ldb.vector; + 
gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.Vector; + gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow - gbmv->args.N) * kargs->ldb.Vector; } } @@ -626,13 +626,13 @@ doTbsv( kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; kargs->C = x; kargs->offCY = offx; - kargs->ldc.vector = incx; + kargs->ldc.Vector = incx; kargs->startRow = 0; if(trans == clblasNoTrans) diff --git a/src/library/blas/xtrmv.c b/src/library/blas/xtrmv.c index 6ff205b9..b4126941 100644 --- a/src/library/blas/xtrmv.c +++ b/src/library/blas/xtrmv.c @@ -132,9 +132,9 @@ doTrmv( kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->C = y; - kargs->ldc.vector = incx; + kargs->ldc.Vector = incx; kargs->offBX = offx; kargs->offCY = 0; // Not used by assignKargs(); Just for clarity kargs->offa = offa; diff --git a/src/library/blas/xtrsv.c b/src/library/blas/xtrsv.c index 1e48349a..c75dadb6 100644 --- a/src/library/blas/xtrsv.c +++ b/src/library/blas/xtrsv.c @@ -406,7 +406,7 @@ doTrsv( kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; - kargs->ldb.vector = incx; + kargs->ldb.Vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; diff --git a/src/library/tools/ktest/config.cpp b/src/library/tools/ktest/config.cpp index 3098e369..8b201288 100644 --- a/src/library/tools/ktest/config.cpp +++ b/src/library/tools/ktest/config.cpp @@ -489,13 +489,13 @@ Config::setLDC(size_t ldc) void Config::setIncX(int incx) { - kargs_.ldb.vector = incx; + kargs_.ldb.Vector = incx; } void Config::setIncY(int incy) { - kargs_.ldc.vector = incy; + kargs_.ldc.Vector = incy; } void diff --git a/src/library/tools/ktest/steps/gemv.cpp b/src/library/tools/ktest/steps/gemv.cpp index c11e3aee..19327ce9 100644 --- a/src/library/tools/ktest/steps/gemv.cpp +++ b/src/library/tools/ktest/steps/gemv.cpp @@ 
-45,8 +45,8 @@ GemvStep::declareVars(Step *masterStep) args.N = addConst("N", "cl_uint", kargs().N); args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); - args.ldb = addConst("incx", "cl_int", kargs().ldb.vector); - args.ldc = addConst("incy", "cl_int", kargs().ldc.vector); + args.ldb = addConst("incx", "cl_int", kargs().ldb.Vector); + args.ldc = addConst("incy", "cl_int", kargs().ldc.Vector); args.offA = addConst("offA", "cl_uint", kargs().offA); args.offBX = addConst("offX", "cl_uint", kargs().offBX); @@ -128,11 +128,11 @@ GemvStep::fixLD() args.lda.matrix = args.N; } - if (args.ldb.vector == 0) { - args.ldb.vector = 1; + if (args.ldb.Vector == 0) { + args.ldb.Vector = 1; } - if (args.ldc.vector == 0) { - args.ldc.vector = 1; + if (args.ldc.Vector == 0) { + args.ldc.Vector = 1; } /* * store original height of the matrix A diff --git a/src/library/tools/ktest/steps/symv.cpp b/src/library/tools/ktest/steps/symv.cpp index bfe22302..4e24e95b 100644 --- a/src/library/tools/ktest/steps/symv.cpp +++ b/src/library/tools/ktest/steps/symv.cpp @@ -45,8 +45,8 @@ SymvStep::declareVars(Step *masterStep) args.K = args.N; args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); - args.ldb = addConst("incx", "cl_int", kargs().ldb.vector); - args.ldc = addConst("incy", "cl_int", kargs().ldc.vector); + args.ldb = addConst("incx", "cl_int", kargs().ldb.Vector); + args.ldc = addConst("incy", "cl_int", kargs().ldc.Vector); args.offsetN = addConst("offsetN", "cl_uint", kargs().offsetN); @@ -108,11 +108,11 @@ SymvStep::fixLD() args.lda.matrix = args.N; } - if (args.ldb.vector == 0) { - args.ldb.vector = 1; + if (args.ldb.Vector == 0) { + args.ldb.Vector = 1; } - if (args.ldc.vector == 0) { - args.ldc.vector = 1; + if (args.ldc.Vector == 0) { + args.ldc.Vector = 1; } args.K = args.N; //store original N From 00a29c6409c9d269e3d680026faf747d0abf85a3 Mon Sep 17 00:00:00 2001 From: tingxingdong Date: Mon, 20 Jun 2016 12:57:24 -0500 Subject: [PATCH 36/45] allow users to easily verify 
the gemm/trmm GPU results with the netlib cblas through client (#274) * (1)update readme: netlib is preferred.(2)now you can verify the correctness of gemm&trmm through client * give more details of how to get CBLAS on windows * find the netlib library dir & library in Cmake files * forget to add this file * disable the validation on windows currently: no easy solution of building/linking netlib CBLAS on windows --- README.md | 6 +- src/CMakeLists.txt | 8 + src/FindNetlib.cmake | 19 + src/client/CMakeLists.txt | 9 +- src/client/clfunc_common.hpp | 98 +++- src/client/clfunc_xgemm.hpp | 579 ++++++++++++--------- src/client/clfunc_xtrmm.hpp | 266 +++++++--- src/client/client.cpp | 967 ++++++++++++++++++----------------- 8 files changed, 1130 insertions(+), 822 deletions(-) diff --git a/README.md b/README.md index cd734da4..8de7d7ec 100644 --- a/README.md +++ b/README.md @@ -197,8 +197,12 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate ### Test infrastructure * Googletest v1.6 -* ACML on windows/linux; Accelerate on Mac OSX * Latest Boost +* CPU BLAS + - Netlib CBLAS (recommended) + Ubuntu: install by "apt-get install libblas-dev" + Windows: download & install lapack-3.6.0 which comes with CBLAS + - or ACML on windows/linux; Accelerate on Mac OSX ### Performance infrastructure * Python diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33a91ee2..73ba594e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -265,6 +265,14 @@ if( BUILD_TEST ) endif( ) endif( ) +if( BUILD_CLIENT ) + if( NETLIB_FOUND ) + else( ) + message( WARNING "Not find Netlib; BUILD_CLIENT needs the Netlib CBLAS library" ) + endif() +endif() + + # This will define OPENCL_FOUND find_package( OpenCL ${OPENCL_VERSION} ) diff --git a/src/FindNetlib.cmake b/src/FindNetlib.cmake index a32474ed..6a21e613 100644 --- a/src/FindNetlib.cmake +++ b/src/FindNetlib.cmake @@ -100,6 +100,25 @@ if( NOT contains_BLAS EQUAL -1 ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( NETLIB 
DEFAULT_MSG Netlib_BLAS_LIBRARY ) endif( ) + +#look for netlib cblas header +if( UNIX ) + find_path(Netlib_INCLUDE_DIRS cblas.h + HINTS + /usr/include + ) +else() + find_path(Netlib_INCLUDE_DIRS cblas.h + HINTS + ${Netlib_ROOT}/CBLAS/include/ + ) +endif() + +if( Netlib_INCLUDE_DIRS ) +else() + message(WARNING "Cannot find cblas.h") +endif() + if( NETLIB_FOUND ) list( APPEND Netlib_LIBRARIES ${Netlib_BLAS_LIBRARY} ) else( ) diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt index eb66f8a0..752b19b3 100644 --- a/src/client/CMakeLists.txt +++ b/src/client/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -48,10 +48,11 @@ include_directories( ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/tests/include + ${Netlib_INCLUDE_DIRS} .) 
add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER}) -target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS) +target_link_libraries(client ${Netlib_LIBRARIES} ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS) set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" OUTPUT_NAME clBLAS-client ) diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp index fc2057ba..0f22ef0f 100644 --- a/src/client/clfunc_common.hpp +++ b/src/client/clfunc_common.hpp @@ -27,6 +27,11 @@ #include "test-limits.h" #include "dis_warning.h" +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else +#include "cblas.h" +#endif + #include "clBLAS.h" #if defined(__APPLE__) || defined(__MACOSX) #include @@ -77,6 +82,57 @@ randomScale() return t; } +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else + +CBLAS_ORDER +clblasToCblas_order(clblasOrder value) +{ + switch (value) { + case clblasRowMajor: return CblasRowMajor; + case clblasColumnMajor: return CblasColMajor; + } +} + +CBLAS_TRANSPOSE +clblasToCblas_operation(clblasTranspose value) +{ + switch (value) { + case clblasNoTrans: return CblasNoTrans; + case clblasTrans: return CblasTrans; + case clblasConjTrans: return CblasConjTrans; + } +} + +CBLAS_UPLO +clblasToCblas_fill(clblasUplo value) +{ + switch (value) { + case clblasUpper: return CblasUpper; + case clblasLower: return CblasLower; + } +} + +CBLAS_SIDE +clblasToCblas_side(clblasSide value) +{ + switch (value) { + case clblasLeft: return CblasLeft; + case clblasRight: return CblasRight; + } +} + +CBLAS_DIAG +clblasToCblas_diag(clblasDiag value) +{ + switch (value) { + case clblasNonUnit: return CblasNonUnit; + case clblasUnit: return CblasUnit; + } +} + +#endif + std::string prettyPrintClStatus( const cl_int& status ) { @@ -269,7 +325,7 @@ class clblasFunc virtual ~clblasFunc() { clblasTeardown(); - + for (unsigned int i = 0; i < numQueues; i++) { OPENCL_V_THROW( clReleaseCommandQueue(queues_[i]), "releasing 
command queue" ); } @@ -278,21 +334,21 @@ class clblasFunc void wait_and_check() { - cl_int err; + cl_int err; cl_int wait_status = clWaitForEvents(1, &event_); if( wait_status != CL_SUCCESS ) { - if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) - { - clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS, + if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) + { + clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &err, NULL ); - std::cout << "blas function execution status error: " << err << std::endl; + std::cout << "blas function execution status error: " << err << std::endl; exit(1); - } + } else { - std::cout << "blas function wait status error: " << wait_status << std::endl; + std::cout << "blas function wait status error: " << wait_status << std::endl; exit(1); } } @@ -300,14 +356,16 @@ class clblasFunc double time_in_ns() { - StatisticalTimer& timer = StatisticalTimer::getInstance( ); + StatisticalTimer& timer = StatisticalTimer::getInstance( ); return timer.getAverageTime( timer_id ) * 1e9; } + virtual void validate_with_cblas(int v) {} + virtual void call_func() = 0; virtual double gflops() = 0; virtual std::string gflops_formula() = 0; - virtual void setup_apiCallCount(cl_uint apiCallCount){} + virtual void setup_apiCallCount(cl_uint apiCallCount){} virtual void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, @@ -317,20 +375,20 @@ class clblasFunc virtual void initialize_cpu_buffer() = 0; virtual void initialize_gpu_buffer() = 0; virtual void reset_gpu_write_buffer() = 0; - virtual void read_gpu_buffer() = 0; - virtual void roundtrip_func() = 0; - virtual void roundtrip_func_rect() {} - virtual void allochostptr_roundtrip_func() {} - virtual void usehostptr_roundtrip_func() {} - virtual void copyhostptr_roundtrip_func() {} - virtual void usepersismem_roundtrip_func() {} - virtual void roundtrip_setup_buffer(int order_option, 
int side_option, + virtual void read_gpu_buffer() = 0; + virtual void roundtrip_func() = 0; + virtual void roundtrip_func_rect() {} + virtual void allochostptr_roundtrip_func() {} + virtual void usehostptr_roundtrip_func() {} + virtual void copyhostptr_roundtrip_func() {} + virtual void usepersismem_roundtrip_func() {} + virtual void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) = 0; - virtual void releaseGPUBuffer_deleteCPUBuffer()=0; + virtual void releaseGPUBuffer_deleteCPUBuffer()=0; StatisticalTimer& timer; StatisticalTimer::sTimerID timer_id; @@ -347,7 +405,7 @@ class clblasFunc clblasOrder order_; cl_event event_; size_t maxMemAllocSize; + int validate_; }; // class clblasFunc #endif // ifndef CLBLAS_BENCHMARK_COMMON_HXX__ - diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp index 8efaf639..57c283de 100644 --- a/src/client/clfunc_xgemm.hpp +++ b/src/client/clfunc_xgemm.hpp @@ -43,14 +43,16 @@ struct xGemmBuffer T* a_; T* b_; T* c_; + T* c_copy; cl_mem buf_a_; cl_mem buf_b_; cl_mem buf_c_; T alpha_; T beta_; - cl_uint apiCallCount; + cl_uint apiCallCount; }; // struct buffer + template class xGemm : public clblasFunc { @@ -68,20 +70,37 @@ class xGemm : public clblasFunc void call_func() { - timer.Start(timer_id); - xGemm_Function(true, buffer_.apiCallCount); - timer.Stop(timer_id); + timer.Start(timer_id); + xGemm_Function(true, buffer_.apiCallCount); + timer.Stop(timer_id); + } + + + void validate_with_cblas(int validate) + { + #if defined ( _WIN32 ) || defined ( _WIN64 ) + #else + if(validate) + { + initialize_cpu_buffer(); + initialize_gpu_buffer(); + xGemm_Function(true, 1); + read_gpu_buffer(); + validation(); + } + #endif } + double gflops() { - return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / 
buffer_.apiCallCount); + return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount); } - void setup_apiCallCount(cl_uint apiCallCount) - { - buffer_.apiCallCount = apiCallCount; - } + void setup_apiCallCount(cl_uint apiCallCount) + { + buffer_.apiCallCount = apiCallCount; + } std::string gflops_formula() { return "2.0*M*N*K/time"; @@ -322,6 +341,7 @@ class xGemm : public clblasFunc buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; + buffer_.c_copy = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, @@ -366,7 +386,7 @@ class xGemm : public clblasFunc { for (size_t j = 0; j < buffer_.ldc_; ++j) { - buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / + buffer_.c_copy[i*buffer_.ldc_+j] = buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } @@ -375,7 +395,7 @@ class xGemm : public clblasFunc void initialize_gpu_buffer() { - cl_int err; + cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), @@ -408,19 +428,19 @@ class xGemm : public clblasFunc buffer_.c_, 0, NULL, NULL); } - void read_gpu_buffer() - { - cl_int err; - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + void read_gpu_buffer() + { + cl_int err; + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, NULL); - } + buffer_.c_, 0, NULL, NULL); + } - void roundtrip_func() - { - timer.Start(timer_id); - cl_int err; + void roundtrip_func() + { + timer.Start(timer_id); + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -452,42 +472,42 @@ class xGemm : 
public clblasFunc buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); - xGemm_Function(false); - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + xGemm_Function(false); + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void roundtrip_func_rect() - { - timer.Start(timer_id); - cl_int err; - //rect - size_t a_buffer_origin[3] = {0,0,0}; - size_t a_host_origin[3] = {0,0,0}; - size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; - size_t a_buffer_row_pitch=0*sizeof(T);//lda - size_t a_buffer_slice_pitch=0; - size_t a_host_row_pitch=buffer_.lda_*sizeof(T); - size_t a_host_slice_pitch=0; - - size_t b_buffer_origin[3] = {0,0,0}; - size_t b_host_origin[3] = {0,0,0}; - size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; - size_t b_buffer_row_pitch=0*sizeof(T);//ldb - size_t b_buffer_slice_pitch=0; - size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); - size_t b_host_slice_pitch=0; - - size_t c_buffer_origin[3] = {0,0,0}; - size_t c_host_origin[3] = {0,0,0}; - size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; - size_t c_buffer_row_pitch=0*sizeof(T);//ldc - size_t c_buffer_slice_pitch=0; - size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); - size_t c_host_slice_pitch=0; + buffer_.c_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); + } + void roundtrip_func_rect() + { + timer.Start(timer_id); + cl_int err; + //rect + size_t a_buffer_origin[3] = {0,0,0}; + size_t a_host_origin[3] = {0,0,0}; + size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; + size_t a_buffer_row_pitch=0*sizeof(T);//lda + size_t a_buffer_slice_pitch=0; + size_t a_host_row_pitch=buffer_.lda_*sizeof(T); + size_t a_host_slice_pitch=0; + + size_t b_buffer_origin[3] = 
{0,0,0}; + size_t b_host_origin[3] = {0,0,0}; + size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; + size_t b_buffer_row_pitch=0*sizeof(T);//ldb + size_t b_buffer_slice_pitch=0; + size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); + size_t b_host_slice_pitch=0; + + size_t c_buffer_origin[3] = {0,0,0}; + size_t c_host_origin[3] = {0,0,0}; + size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; + size_t c_buffer_row_pitch=0*sizeof(T);//ldc + size_t c_buffer_slice_pitch=0; + size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); + size_t c_host_slice_pitch=0; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.k_*buffer_.m_ + @@ -504,12 +524,12 @@ class xGemm : public clblasFunc buffer_.offC_) * sizeof(T), NULL, &err); /* - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); - + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * @@ -522,47 +542,47 @@ class xGemm : public clblasFunc sizeof(T), buffer_.c_, 0, NULL, NULL);*/ err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch, - a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); + a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch, - b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); + b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, - c_buffer_slice_pitch, 
c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); - - if(buffer_.trans_a_==clblasNoTrans) - { - buffer_.lda_=buffer_.m_; - } - else - { - buffer_.lda_=buffer_.k_; - } - if(buffer_.trans_b_==clblasNoTrans) - { - buffer_.ldb_=buffer_.k_; - } - else - { - buffer_.ldb_=buffer_.n_; - } - buffer_.ldc_=buffer_.m_; - xGemm_Function(false); - /* - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); + + if(buffer_.trans_a_==clblasNoTrans) + { + buffer_.lda_=buffer_.m_; + } + else + { + buffer_.lda_=buffer_.k_; + } + if(buffer_.trans_b_==clblasNoTrans) + { + buffer_.ldb_=buffer_.k_; + } + else + { + buffer_.ldb_=buffer_.n_; + } + buffer_.ldc_=buffer_.m_; + xGemm_Function(false); + /* + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, &event_); - */ - err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, - c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void allochostptr_roundtrip_func() - { - timer.Start(timer_id); - - cl_int err; - // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy + buffer_.c_, 0, NULL, &event_); + */ + err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, + c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); + } + void allochostptr_roundtrip_func() + { + timer.Start(timer_id); + + cl_int err; + // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY 
| CL_MEM_ALLOC_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -578,45 +598,45 @@ class xGemm : public clblasFunc buffer_.offC_) * sizeof(T), NULL, &err); - // map the buffers to pointers at host device - T *map_a,*map_b,*map_c; - map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.a_num_vectors_ + + // map the buffers to pointers at host device + T *map_a,*map_b,*map_c; + map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), - 0, NULL, NULL, &err); - map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.ldb_*buffer_.b_num_vectors_ + + 0, NULL, NULL, &err); + map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), - 0, NULL, NULL, &err); - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - // memcpy the input A, B, C to the host pointers - memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); - memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - // unmap the buffers - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); - // calling clBLAS - xGemm_Function(false); - // map the C buffer again to read 
output - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + // memcpy the input A, B, C to the host pointers + memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); + memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + // unmap the buffers + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); + // calling clBLAS + xGemm_Function(false); + // map the C buffer again to read output + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); - clWaitForEvents(1, &event_); - - timer.Stop(timer_id); - } - void usehostptr_roundtrip_func() - { - timer.Start(timer_id); - cl_int err; + 0, NULL, NULL, &err); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); + clWaitForEvents(1, &event_); + + timer.Stop(timer_id); + } + void usehostptr_roundtrip_func() + { + timer.Start(timer_id); + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -631,13 +651,13 @@ class xGemm : public clblasFunc (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), buffer_.c_, &err); - 
xGemm_Function(true); - timer.Stop(timer_id); - } - void copyhostptr_roundtrip_func() - { - timer.Start(timer_id); - cl_int err; + xGemm_Function(true); + timer.Stop(timer_id); + } + void copyhostptr_roundtrip_func() + { + timer.Start(timer_id); + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -652,20 +672,20 @@ class xGemm : public clblasFunc (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), buffer_.c_, &err); - xGemm_Function(false); - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + xGemm_Function(false); + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void usepersismem_roundtrip_func() - { + buffer_.c_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); + } + void usepersismem_roundtrip_func() + { #if defined(CL_MEM_USE_PERSISTENT_MEM_AMD) - timer.Start(timer_id); + timer.Start(timer_id); - cl_int err; + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.lda_*buffer_.a_num_vectors_ + @@ -682,46 +702,46 @@ class xGemm : public clblasFunc buffer_.offC_) * sizeof(T), NULL, &err); - // map the buffers to pointers at host devices - T *map_a,*map_b,*map_c; - map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.a_num_vectors_ + + // map the buffers to pointers at host devices + T *map_a,*map_b,*map_c; + map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), - 0, NULL, NULL, &err); - map_b = (T*)clEnqueueMapBuffer(queues_[0], 
buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.ldb_*buffer_.b_num_vectors_ + + 0, NULL, NULL, &err); + map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), - 0, NULL, NULL, &err); - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - // memcpy the input A, B, C to the host pointers - memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); - memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - // unmap the buffers - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); - // calling clBLAS - xGemm_Function(false); - // map the C buffer again to read output - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + // memcpy the input A, B, C to the host pointers + memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); + memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + // unmap the buffers + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); + 
clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); + // calling clBLAS + xGemm_Function(false); + // map the C buffer again to read output + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); - clWaitForEvents(1, &event_); + 0, NULL, NULL, &err); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); + clWaitForEvents(1, &event_); - timer.Stop(timer_id); + timer.Stop(timer_id); #else - std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"< buffer_; - void xGemm_Function(bool flush, cl_uint apiCallCount = 1); - unsigned int numQueuesToUse; - cl_event events_[numQueues]; + void xGemm_Function(bool flush, cl_uint apiCallCount = 1); + unsigned int numQueuesToUse; + cl_event events_[numQueues]; +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else + void validation(); +#endif }; // class xgemm template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { for (unsigned int i = 0; i < numQueues; i++) { events_[i] = NULL; } - for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_, - buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, - buffer_.buf_a_, buffer_.offA_, buffer_.lda_, - buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, - buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); - } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + for (unsigned int i = 0; i < apiCallCount; i++) + { + 
clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, + buffer_.buf_a_, buffer_.offA_, buffer_.lda_, + buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, + buffer_.beta_, buffer_.buf_c_, buffer_.offC_, + buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); + } + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1025,16 +1050,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } - + for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { @@ -1042,18 +1067,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + { + clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1070,16 +1095,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } - + 
for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { @@ -1087,18 +1112,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + { + clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1115,16 +1140,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } - + for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { @@ -1132,18 +1157,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + { + clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, 
buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1164,8 +1189,8 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> @@ -1200,4 +1225,84 @@ gflops_formula() return "8.0*M*N*K/time"; } +#if defined ( _WIN32 ) || defined (_WIN64 ) + +#else + +template<> +void +xGemm:: +validation() +{ + cblas_sgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + buffer_.beta_, + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cblas_saxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.c_, 1, buffer_.c_copy, 1); + float norm_error = cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_sgemm against cblas_sgemm = %f \n", norm_error); +} + +template<> +void +xGemm:: +validation() +{ + cblas_dgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + buffer_.beta_, + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cblas_daxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.c_, 1, buffer_.c_copy, 1); + double norm_error = cblas_dnrm2(buffer_.lda_ * buffer_.n_, 
buffer_.c_copy, 1)/ + cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_dgemm against cblas_dgemm = %f \n", norm_error); +} + +template<> +void +xGemm:: +validation() +{ + cblas_cgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + &(buffer_.beta_), + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cl_float2 neg_one = makeScalar(-1.0); + cblas_caxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1); + float norm_error = cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_cgemm against cblas_cgemm = %f \n", norm_error); +} + +template<> +void +xGemm:: +validation() +{ + cblas_zgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + &(buffer_.beta_), + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cl_double2 neg_one = makeScalar(-1.0); + cblas_zaxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1); + double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_zgemm against cblas_zgemm = %f \n", norm_error); +} + +#endif + #endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp index 92d883cf..0cd1ff46 100644 --- a/src/client/clfunc_xtrmm.hpp +++ b/src/client/clfunc_xtrmm.hpp @@ -40,6 +40,7 @@ struct xTrmmBuffer clblasDiag diag_; T* a_; T* b_; + T* b_copy; cl_mem buf_a_; cl_mem buf_b_; T alpha_; @@ -64,6 +65,23 @@ 
class xTrmm : public clblasFunc std::cout << "xtrmm::call_func\n"; } + + void validate_with_cblas(int validate) + { + #if defined ( _WIN32 ) || defined ( _WIN64 ) + #else + if(validate) + { + initialize_cpu_buffer(); + initialize_gpu_buffer(); + call_func(); + read_gpu_buffer(); + validation(); + } + #endif + } + + double gflops() { if (buffer_.side_ == clblasLeft) @@ -225,6 +243,7 @@ class xTrmm : public clblasFunc buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; + buffer_.b_copy = new T[buffer_.ldb_*buffer_.b_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, @@ -246,7 +265,7 @@ class xTrmm : public clblasFunc { for (size_t j = 0; j < buffer_.ldb_; ++j) { - buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / + buffer_.b_copy[i*buffer_.ldb_+j] = buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / randomScale(); } } @@ -294,29 +313,29 @@ class xTrmm : public clblasFunc sizeof(T), buffer_.b_, 0, NULL, NULL); } - void read_gpu_buffer() - { - cl_int err; - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * - sizeof(T), - buffer_.b_, 0, NULL, NULL); - } - void roundtrip_func() - { - std::cout << "xTrmm::roundtrip_func\n"; - } - void zerocopy_roundtrip_func() - { - std::cout << "xTrmm::zerocopy_roundtrip_func\n"; - } - void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, + void read_gpu_buffer() + { + cl_int err; + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * + sizeof(T), + buffer_.b_, 0, NULL, NULL); + } + void roundtrip_func() + { + std::cout << "xTrmm::roundtrip_func\n"; + } + void zerocopy_roundtrip_func() + { + std::cout << "xTrmm::zerocopy_roundtrip_func\n"; + } + void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int 
transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) - { - DUMMY_ARGS_USAGE_3(transB_option, K, beta); + { + DUMMY_ARGS_USAGE_3(transB_option, K, beta); DUMMY_ARGS_USAGE_2(ldc, offCY); initialize_scalars(alpha, beta); @@ -447,18 +466,20 @@ class xTrmm : public clblasFunc buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; - } - void releaseGPUBuffer_deleteCPUBuffer() - { - //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) - //need to do this before we eventually hit the destructor + } + + void releaseGPUBuffer_deleteCPUBuffer() + { + //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) + //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.b_; + delete buffer_.b_copy; OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_), "releasing buffer B"); - } + } protected: void initialize_scalars(double alpha, double beta) { @@ -468,7 +489,10 @@ class xTrmm : public clblasFunc private: xTrmmBuffer buffer_; - +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else + void validation(); +#endif }; // class xTrmm template<> @@ -494,9 +518,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float), @@ -506,8 +530,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = 
clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float), @@ -518,20 +542,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_float), buffer_.b_, 0, NULL, NULL); - //call_func - clblasStrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasStrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -558,9 +582,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double), @@ -570,8 +594,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double), @@ -582,20 +606,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_double), buffer_.b_, 0, NULL, NULL); - //call_func - clblasDtrmm(order_, 
buffer_.side_, buffer_.uplo_, + //call_func + clblasDtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -622,9 +646,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float2), @@ -634,8 +658,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float2), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float2), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float2), @@ -646,20 +670,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_float2), buffer_.b_, 0, NULL, NULL); - //call_func - clblasCtrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasCtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - 
err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float2), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -686,9 +710,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double2), @@ -698,8 +722,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double2), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double2), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double2), @@ -710,20 +734,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_double2), buffer_.b_, 0, NULL, NULL); - //call_func - clblasZtrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasZtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ * 
sizeof(cl_double2), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -790,5 +814,89 @@ gflops_formula() } } +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else + +template<> +void +xTrmm:: +validation() +{ + cblas_strmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cblas_saxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.b_, 1, buffer_.b_copy, 1); + float norm_error = cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_strmm against cblas_strmm = %f \n", norm_error); +} + + +template<> +void +xTrmm:: +validation() +{ + cblas_dtrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cblas_daxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.b_, 1, buffer_.b_copy, 1); + double norm_error = cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_dtrmm against cblas_dtrmm = %f \n", norm_error); +} + +template<> +void +xTrmm:: +validation() +{ + cblas_ctrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy 
+ buffer_.offB_, buffer_.ldb_); + + cl_float2 neg_one = makeScalar(-1.0); + cblas_caxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1); + float norm_error = cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_ctrmm against cblas_ctrmm = %f \n", norm_error); +} + + +template<> +void +xTrmm:: +validation() +{ + cblas_ztrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cl_double2 neg_one = makeScalar(-1.0); + cblas_zaxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1); + double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_ztrmm against cblas_ztrmm = %f \n", norm_error); +} + +#endif #endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__ diff --git a/src/client/client.cpp b/src/client/client.cpp index d067c3db..ba9c5fc4 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -46,547 +46,552 @@ namespace po = boost::program_options; int main(int argc, char *argv[]) { - size_t M; - size_t N; - size_t K; - cl_double alpha; - cl_double beta; - cl_uint profileCount; - cl_uint apiCallCount; - cl_uint commandQueueFlags = 0; - cl_device_type deviceType = CL_DEVICE_TYPE_GPU; - int order_option; - //clblasOrder order; - //clblasTranspose transA; - //clblasTranspose transB; - int transA_option; - int transB_option; - size_t lda; - size_t ldb; - size_t ldc; - size_t offA; - size_t offBX; - size_t offCY; - std::string function; - std::string precision; - std::string roundtrip; - std::string memalloc; - int side_option; - int uplo_option; - int diag_option; - 
unsigned int numQueuesToUse; + size_t M; + size_t N; + size_t K; + cl_double alpha; + cl_double beta; + cl_uint profileCount; + cl_uint apiCallCount; + cl_uint commandQueueFlags = 0; + cl_device_type deviceType = CL_DEVICE_TYPE_GPU; + int order_option; + //clblasOrder order; + //clblasTranspose transA; + //clblasTranspose transB; + int transA_option; + int transB_option; + size_t lda; + size_t ldb; + size_t ldc; + size_t offA; + size_t offBX; + size_t offCY; + std::string function; + std::string precision; + std::string roundtrip; + std::string memalloc; + int side_option; + int uplo_option; + int diag_option; + unsigned int numQueuesToUse; + int validate; - po::options_description desc( "clBLAS client command line options" ); - desc.add_options() - ( "help,h", "produces this help message" ) - ( "gpu,g", "Force instantiation of an OpenCL GPU device" ) - ( "cpu,c", "Force instantiation of an OpenCL CPU device" ) - ( "all,a", "Force instantiation of all OpenCL devices" ) - ( "useimages", "Use an image-based kernel" ) - ( "sizem,m", po::value( &M )->default_value(128), "number of rows in A and C" ) - ( "sizen,n", po::value( &N )->default_value(128), "number of columns in B and C" ) - ( "sizek,k", po::value( &K )->default_value(128), "number of columns in A and rows in B" ) - ( "lda", po::value( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" ) - ( "ldb", po::value( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" ) - ( "ldc", po::value( &ldc )->default_value(0), "first dimension of C in memory. 
if set to 0, ldc will default to M" ) - ( "offA", po::value( &offA )->default_value(0), "offset of the matrix A in memory object" ) - ( "offBX", po::value( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" ) - ( "offCY", po::value( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" ) - ( "alpha", po::value( &alpha )->default_value(1.0f), "specifies the scalar alpha" ) - ( "beta", po::value( &beta )->default_value(1.0f), "specifies the scalar beta" ) - ( "order,o", po::value( &order_option )->default_value(0), "0 = row major, 1 = column major" ) - ( "transposeA", po::value( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) - ( "transposeB", po::value( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) - ( "function,f", po::value( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" ) - ( "precision,r", po::value( &precision )->default_value("s"), "Options: s,d,c,z" ) - ( "side", po::value( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm - ( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm - ( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. 
only used with [list of function families]" ) // xtrsm xtrmm - ( "profile,p", po::value( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" ) - ( "apiCallCount", po::value(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)") - ( "numQueues", po::value(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)") - ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)") - ( "memalloc", po::value( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem") - ; + po::options_description desc( "clBLAS client command line options" ); + desc.add_options() + ( "help,h", "produces this help message" ) + ( "gpu,g", "Force instantiation of an OpenCL GPU device" ) + ( "cpu,c", "Force instantiation of an OpenCL CPU device" ) + ( "all,a", "Force instantiation of all OpenCL devices" ) + ( "useimages", "Use an image-based kernel" ) + ( "sizem,m", po::value( &M )->default_value(128), "number of rows in A and C" ) + ( "sizen,n", po::value( &N )->default_value(128), "number of columns in B and C" ) + ( "sizek,k", po::value( &K )->default_value(128), "number of columns in A and rows in B" ) + ( "lda", po::value( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" ) + ( "ldb", po::value( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" ) + ( "ldc", po::value( &ldc )->default_value(0), "first dimension of C in memory. 
if set to 0, ldc will default to M" ) + ( "offA", po::value( &offA )->default_value(0), "offset of the matrix A in memory object" ) + ( "offBX", po::value( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" ) + ( "offCY", po::value( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" ) + ( "alpha", po::value( &alpha )->default_value(1.0f), "specifies the scalar alpha" ) + ( "beta", po::value( &beta )->default_value(1.0f), "specifies the scalar beta" ) + ( "order,o", po::value( &order_option )->default_value(1), "0 = row major, 1 = column major" ) + ( "transposeA", po::value( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) + ( "transposeB", po::value( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) + ( "function,f", po::value( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" ) + ( "precision,r", po::value( &precision )->default_value("s"), "Options: s,d,c,z" ) + ( "side", po::value( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm + ( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm + ( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. 
only used with [list of function families]" ) // xtrsm xtrmm + ( "profile,p", po::value( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" ) + ( "apiCallCount", po::value(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)") + ( "numQueues", po::value(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)") + ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)") + ( "memalloc", po::value( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem") + ( "validate,v", po::value(&validate)->default_value(0), "Validate GPU results with CPU BLAS? 0 = No, 1 = Yes (default: No): currently only available for gemm and trmm") + ; - po::variables_map vm; - po::store( po::parse_command_line( argc, argv, desc ), vm ); - po::notify( vm ); + po::variables_map vm; + po::store( po::parse_command_line( argc, argv, desc ), vm ); + po::notify( vm ); - if( vm.count( "help" ) ) - { - std::cout << desc << std::endl; - return 0; - } + if( vm.count( "help" ) ) + { + std::cout << desc << std::endl; + return 0; + } - if( function != "gemm" - && function != "trsm" - && function != "trmm" - && function != "gemv" - && function != "symv" - && function != "syrk" - && function != "syr2k" - && function != "trsv" - && function != "trmv" - && function != "ger" - && function != "syr" - && function != "syr2" - && function != "geru" - && function != "gerc" - && function != "her" - && function != "her2" - && function != "hemv" - && function != "hemm" - && function != "symm" - && function != "herk" - && function != "her2k" - ) - { - std::cerr << "Invalid value for --function" << 
std::endl; - return -1; - } + if( function != "gemm" + && function != "trsm" + && function != "trmm" + && function != "gemv" + && function != "symv" + && function != "syrk" + && function != "syr2k" + && function != "trsv" + && function != "trmv" + && function != "ger" + && function != "syr" + && function != "syr2" + && function != "geru" + && function != "gerc" + && function != "her" + && function != "her2" + && function != "hemv" + && function != "hemm" + && function != "symm" + && function != "herk" + && function != "her2k" + ) + { + std::cerr << "Invalid value for --function" << std::endl; + return -1; + } - if( precision != "s" && precision != "d" && precision != "c" && precision != "z" ) - { - std::cerr << "Invalid value for --precision" << std::endl; - return -1; - } + if( precision != "s" && precision != "d" && precision != "c" && precision != "z" ) + { + std::cerr << "Invalid value for --precision" << std::endl; + return -1; + } - size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) - | ((vm.count( "cpu" ) > 0) ? 2 : 0) - | ((vm.count( "all" ) > 0) ? 4 : 0); - if((mutex & (mutex-1)) != 0) { - std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl; - if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl; - if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl; - if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl; - return 1; - } + size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) + | ((vm.count( "cpu" ) > 0) ? 2 : 0) + | ((vm.count( "all" ) > 0) ? 
4 : 0); + if((mutex & (mutex-1)) != 0) { + std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl; + if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl; + if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl; + if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl; + return 1; + } - if( vm.count( "gpu" ) ) - { - deviceType = CL_DEVICE_TYPE_GPU; - } + if( vm.count( "gpu" ) ) + { + deviceType = CL_DEVICE_TYPE_GPU; + } - if( vm.count( "cpu" ) ) - { - deviceType = CL_DEVICE_TYPE_CPU; - } + if( vm.count( "cpu" ) ) + { + deviceType = CL_DEVICE_TYPE_CPU; + } - if( vm.count( "all" ) ) - { - deviceType = CL_DEVICE_TYPE_ALL; - } + if( vm.count( "all" ) ) + { + deviceType = CL_DEVICE_TYPE_ALL; + } - if( profileCount >= 1 ) - { - commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE; - } + if( profileCount >= 1 ) + { + commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE; + } - bool useimages; - if( vm.count("useimages") ) - useimages = true; - else - useimages = false; + bool useimages; + if( vm.count("useimages") ) + useimages = true; + else + useimages = false; - StatisticalTimer& timer = StatisticalTimer::getInstance( ); - timer.Reserve( 3, profileCount ); - timer.setNormalize( true ); + StatisticalTimer& timer = StatisticalTimer::getInstance( ); + timer.Reserve( 3, profileCount ); + timer.setNormalize( true ); - clblasFunc *my_function = NULL; - if (function == "gemm") - { - if (precision == "s") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else if (precision == "d") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else if (precision == "c") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else if (precision == "z") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else - { - std::cerr << "Unknown gemm function" << std::endl; - 
return -1; - } - } - else if (function == "trsm") - { - if (precision == "s") - my_function = new xTrsm(timer, deviceType); - else if (precision == "d") - my_function = new xTrsm(timer, deviceType); - else if (precision == "c") - my_function = new xTrsm(timer, deviceType); - else if (precision == "z") - my_function = new xTrsm(timer, deviceType); - else - { - std::cerr << "Unknown trsm function" << std::endl; - return -1; - } - } - else if (function == "trmm") - { - if (precision == "s") - my_function = new xTrmm(timer, deviceType); - else if (precision == "d") - my_function = new xTrmm(timer, deviceType); - else if (precision == "c") - my_function = new xTrmm(timer, deviceType); - else if (precision == "z") - my_function = new xTrmm(timer, deviceType); - else + clblasFunc *my_function = NULL; + if (function == "gemm") { - std::cerr << "Unknown trmm function" << std::endl; - return -1; - } - } - else if (function == "gemv") - { - if (precision == "s") - my_function = new xGemv(timer, deviceType); - else if (precision == "d") - my_function = new xGemv(timer, deviceType); - else if (precision == "c") - my_function = new xGemv(timer, deviceType); - else if (precision == "z") - my_function = new xGemv(timer, deviceType); - else + if (precision == "s") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else if (precision == "d") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else if (precision == "c") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else if (precision == "z") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else + { + std::cerr << "Unknown gemm function" << std::endl; + return -1; + } + } + else if (function == "trsm") { - std::cerr << "Unknown gemv function" << std::endl; - return -1; + if (precision == "s") + my_function = new xTrsm(timer, deviceType); + else if (precision == "d") + my_function = new xTrsm(timer, deviceType); + else if (precision == "c") + my_function = new xTrsm(timer, 
deviceType); + else if (precision == "z") + my_function = new xTrsm(timer, deviceType); + else + { + std::cerr << "Unknown trsm function" << std::endl; + return -1; + } } - } - else if (function == "symv") - { - if (precision == "s") - my_function = new xSymv(timer, deviceType); - else if (precision == "d") - my_function = new xSymv(timer, deviceType); - else + else if (function == "trmm") { - std::cerr << "Unknown symv function" << std::endl; - return -1; + if (precision == "s") + my_function = new xTrmm(timer, deviceType); + else if (precision == "d") + my_function = new xTrmm(timer, deviceType); + else if (precision == "c") + my_function = new xTrmm(timer, deviceType); + else if (precision == "z") + my_function = new xTrmm(timer, deviceType); + else + { + std::cerr << "Unknown trmm function" << std::endl; + return -1; + } } - } - else if (function == "syrk") - { - if (precision == "s") - my_function = new xSyrk(timer, deviceType); - else if (precision == "d") - my_function = new xSyrk(timer, deviceType); + else if (function == "gemv") + { + if (precision == "s") + my_function = new xGemv(timer, deviceType); + else if (precision == "d") + my_function = new xGemv(timer, deviceType); else if (precision == "c") - my_function = new xSyrk(timer, deviceType); + my_function = new xGemv(timer, deviceType); else if (precision == "z") - my_function = new xSyrk(timer, deviceType); - else + my_function = new xGemv(timer, deviceType); + else + { + std::cerr << "Unknown gemv function" << std::endl; + return -1; + } + } + else if (function == "symv") { - std::cerr << "Unknown syrk function" << std::endl; - return -1; - } - } - else if (function == "syr2k") - { - if (precision == "s") - my_function = new xSyr2k(timer, deviceType); - else if (precision == "d") - my_function = new xSyr2k(timer, deviceType); - else if (precision == "c") - my_function = new xSyr2k(timer, deviceType); - else if (precision == "z") - my_function = new xSyr2k(timer, deviceType); - else + if (precision 
== "s") + my_function = new xSymv(timer, deviceType); + else if (precision == "d") + my_function = new xSymv(timer, deviceType); + else + { + std::cerr << "Unknown symv function" << std::endl; + return -1; + } + } + else if (function == "syrk") { - std::cerr << "Unknown syr2k function" << std::endl; - return -1; - } - } - else if (function == "trsv") - { - if (precision == "s") - my_function = new xTrsv(timer, deviceType); - else if (precision == "d") - my_function = new xTrsv(timer, deviceType); - else if (precision == "c") - my_function = new xTrsv(timer, deviceType); - else if (precision == "z") - my_function = new xTrsv(timer, deviceType); - else + if (precision == "s") + my_function = new xSyrk(timer, deviceType); + else if (precision == "d") + my_function = new xSyrk(timer, deviceType); + else if (precision == "c") + my_function = new xSyrk(timer, deviceType); + else if (precision == "z") + my_function = new xSyrk(timer, deviceType); + else + { + std::cerr << "Unknown syrk function" << std::endl; + return -1; + } + } + else if (function == "syr2k") { - std::cerr << "Unknown trsv function" << std::endl; - return -1; - } - } - else if (function == "trmv") - { - if (precision == "s") - my_function = new xTrmv(timer, deviceType); - else if (precision == "d") - my_function = new xTrmv(timer, deviceType); - else if (precision == "c") - my_function = new xTrmv(timer, deviceType); - else if (precision == "z") - my_function = new xTrmv(timer, deviceType); - else + if (precision == "s") + my_function = new xSyr2k(timer, deviceType); + else if (precision == "d") + my_function = new xSyr2k(timer, deviceType); + else if (precision == "c") + my_function = new xSyr2k(timer, deviceType); + else if (precision == "z") + my_function = new xSyr2k(timer, deviceType); + else + { + std::cerr << "Unknown syr2k function" << std::endl; + return -1; + } + } + else if (function == "trsv") + { + if (precision == "s") + my_function = new xTrsv(timer, deviceType); + else if (precision == 
"d") + my_function = new xTrsv(timer, deviceType); + else if (precision == "c") + my_function = new xTrsv(timer, deviceType); + else if (precision == "z") + my_function = new xTrsv(timer, deviceType); + else + { + std::cerr << "Unknown trsv function" << std::endl; + return -1; + } + } + else if (function == "trmv") { - std::cerr << "Unknown trmv function" << std::endl; - return -1; + if (precision == "s") + my_function = new xTrmv(timer, deviceType); + else if (precision == "d") + my_function = new xTrmv(timer, deviceType); + else if (precision == "c") + my_function = new xTrmv(timer, deviceType); + else if (precision == "z") + my_function = new xTrmv(timer, deviceType); + else + { + std::cerr << "Unknown trmv function" << std::endl; + return -1; + } } - } - else if (function == "ger") - { - if (precision == "s") - my_function = new xGer(timer, deviceType); - else if (precision == "d") - my_function = new xGer(timer, deviceType); - else + else if (function == "ger") { - std::cerr << "Unknown ger function" << std::endl; - return -1; + if (precision == "s") + my_function = new xGer(timer, deviceType); + else if (precision == "d") + my_function = new xGer(timer, deviceType); + else + { + std::cerr << "Unknown ger function" << std::endl; + return -1; + } } - } - else if (function == "syr") - { - if (precision == "s") - my_function = new xSyr(timer, deviceType); - else if (precision == "d") - my_function = new xSyr(timer, deviceType); - else + else if (function == "syr") { - std::cerr << "Unknown syr function" << std::endl; - return -1; + if (precision == "s") + my_function = new xSyr(timer, deviceType); + else if (precision == "d") + my_function = new xSyr(timer, deviceType); + else + { + std::cerr << "Unknown syr function" << std::endl; + return -1; + } } - } - else if (function == "syr2") - { - if (precision == "s") - my_function = new xSyr2(timer, deviceType); - else if (precision == "d") - my_function = new xSyr2(timer, deviceType); - else + else if (function == 
"syr2") { - std::cerr << "Unknown syr2 function" << std::endl; - return -1; + if (precision == "s") + my_function = new xSyr2(timer, deviceType); + else if (precision == "d") + my_function = new xSyr2(timer, deviceType); + else + { + std::cerr << "Unknown syr2 function" << std::endl; + return -1; + } } - } - else if (function == "geru") - { - if (precision == "c") - my_function = new xGeru(timer, deviceType); - else if (precision == "z") - my_function = new xGeru(timer, deviceType); - else + else if (function == "geru") { - std::cerr << "Unknown geru function" << std::endl; - return -1; + if (precision == "c") + my_function = new xGeru(timer, deviceType); + else if (precision == "z") + my_function = new xGeru(timer, deviceType); + else + { + std::cerr << "Unknown geru function" << std::endl; + return -1; + } } - } - else if (function == "gerc") - { - if (precision == "c") - my_function = new xGerc(timer, deviceType); - else if (precision == "z") - my_function = new xGerc(timer, deviceType); - else + else if (function == "gerc") { - std::cerr << "Unknown gerc function" << std::endl; - return -1; + if (precision == "c") + my_function = new xGerc(timer, deviceType); + else if (precision == "z") + my_function = new xGerc(timer, deviceType); + else + { + std::cerr << "Unknown gerc function" << std::endl; + return -1; + } } - } - else if (function == "her") - { - if (precision == "c") - my_function = new xHer(timer, deviceType); - else if (precision == "z") - my_function = new xHer(timer, deviceType); - else + else if (function == "her") { - std::cerr << "Unknown her function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHer(timer, deviceType); + else if (precision == "z") + my_function = new xHer(timer, deviceType); + else + { + std::cerr << "Unknown her function" << std::endl; + return -1; + } } - } - else if (function == "her2") - { - if (precision == "c") - my_function = new xHer2(timer, deviceType); - else if (precision == "z") - 
my_function = new xHer2(timer, deviceType); - else + else if (function == "her2") { - std::cerr << "Unknown her2 function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHer2(timer, deviceType); + else if (precision == "z") + my_function = new xHer2(timer, deviceType); + else + { + std::cerr << "Unknown her2 function" << std::endl; + return -1; + } } - } - else if (function == "hemv") - { - if (precision == "c") - my_function = new xHemv(timer, deviceType); - else if (precision == "z") - my_function = new xHemv(timer, deviceType); - else + else if (function == "hemv") { - std::cerr << "Unknown hemv function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHemv(timer, deviceType); + else if (precision == "z") + my_function = new xHemv(timer, deviceType); + else + { + std::cerr << "Unknown hemv function" << std::endl; + return -1; + } } - } - else if (function == "hemm") - { - if (precision == "c") - my_function = new xHemm(timer, deviceType); - else if (precision == "z") - my_function = new xHemm(timer, deviceType); - else + else if (function == "hemm") { - std::cerr << "Unknown hemm function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHemm(timer, deviceType); + else if (precision == "z") + my_function = new xHemm(timer, deviceType); + else + { + std::cerr << "Unknown hemm function" << std::endl; + return -1; + } } - } - else if (function == "herk") - { - if (precision == "c") - my_function = new xHerk(timer, deviceType); - else if (precision == "z") - my_function = new xHerk(timer, deviceType); - else + else if (function == "herk") { - std::cerr << "Unknown her function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHerk(timer, deviceType); + else if (precision == "z") + my_function = new xHerk(timer, deviceType); + else + { + std::cerr << "Unknown her function" << std::endl; + return -1; + } } - } - else if (function == "her2k") - { - if (precision == "c") - 
my_function = new xHer2k(timer, deviceType); - else if (precision == "z") - my_function = new xHer2k(timer, deviceType); - else + else if (function == "her2k") { - std::cerr << "Unknown her2 function" << std::endl; - return -1; - } - } - else if (function == "symm") - { - if (precision == "s") - my_function = new xSymm(timer, deviceType); - else if (precision == "d") - my_function = new xSymm(timer, deviceType); - else if (precision == "c") - my_function = new xSymm(timer, deviceType); - else if (precision == "z") - my_function = new xSymm(timer, deviceType); - else + if (precision == "c") + my_function = new xHer2k(timer, deviceType); + else if (precision == "z") + my_function = new xHer2k(timer, deviceType); + else + { + std::cerr << "Unknown her2 function" << std::endl; + return -1; + } + } + else if (function == "symm") { - std::cerr << "Unknown symm function" << std::endl; - return -1; + if (precision == "s") + my_function = new xSymm(timer, deviceType); + else if (precision == "d") + my_function = new xSymm(timer, deviceType); + else if (precision == "c") + my_function = new xSymm(timer, deviceType); + else if (precision == "z") + my_function = new xSymm(timer, deviceType); + else + { + std::cerr << "Unknown symm function" << std::endl; + return -1; + } } - } - try - { - my_function->setup_buffer( order_option, side_option, uplo_option, - diag_option, transA_option, transB_option, + try + { + my_function->setup_buffer( order_option, side_option, uplo_option, + diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); - - my_function->initialize_cpu_buffer(); - my_function->initialize_gpu_buffer(); - my_function->setup_apiCallCount(apiCallCount); - my_function->call_func(); // do a calculation first to get any compilation out of the way - my_function->reset_gpu_write_buffer(); // reset GPU write buffer - } - catch( std::exception& exc ) - { - std::cerr << exc.what( ) << std::endl; - return 1; - } - 
if(roundtrip=="roundtrip"||roundtrip=="both") - { - timer.Reset(); - for( cl_uint i = 0; i < profileCount; ++i ) - { - my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option, - diag_option, transA_option, transB_option, + my_function->initialize_cpu_buffer(); + my_function->initialize_gpu_buffer(); + my_function->setup_apiCallCount(apiCallCount); + my_function->call_func(); // do a calculation first to get any compilation out of the way + my_function->reset_gpu_write_buffer(); // reset GPU write buffer + } + catch( std::exception& exc ) + { + std::cerr << exc.what( ) << std::endl; + return 1; + } + if(roundtrip=="roundtrip"||roundtrip=="both") + { + timer.Reset(); + for( cl_uint i = 0; i < profileCount; ++i ) + { + my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option, + diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); - my_function->initialize_cpu_buffer(); - /*my_function->initialize_gpu_buffer(); - my_function->call_func(); - my_function->read_gpu_buffer(); - my_function->reset_gpu_write_buffer();*/ - - if(memalloc=="default") - { - my_function->roundtrip_func(); - } - else if (memalloc=="alloc_host_ptr") - { - my_function->allochostptr_roundtrip_func(); - } - else if (memalloc=="use_host_ptr") - { - my_function->usehostptr_roundtrip_func(); - } - else if (memalloc=="copy_host_ptr") - { - my_function->copyhostptr_roundtrip_func(); - } - else if (memalloc=="use_persistent_mem_amd") - { - my_function->usepersismem_roundtrip_func(); - } - else if (memalloc=="rect_mem") - { - my_function->roundtrip_func_rect(); - } - //my_function->reset_gpu_write_buffer(); - my_function->releaseGPUBuffer_deleteCPUBuffer(); - } + my_function->initialize_cpu_buffer(); + /*my_function->initialize_gpu_buffer(); + my_function->call_func(); + my_function->read_gpu_buffer(); + my_function->reset_gpu_write_buffer();*/ + + if(memalloc=="default") + { + my_function->roundtrip_func(); + } + else if 
(memalloc=="alloc_host_ptr") + { + my_function->allochostptr_roundtrip_func(); + } + else if (memalloc=="use_host_ptr") + { + my_function->usehostptr_roundtrip_func(); + } + else if (memalloc=="copy_host_ptr") + { + my_function->copyhostptr_roundtrip_func(); + } + else if (memalloc=="use_persistent_mem_amd") + { + my_function->usepersismem_roundtrip_func(); + } + else if (memalloc=="rect_mem") + { + my_function->roundtrip_func_rect(); + } + //my_function->reset_gpu_write_buffer(); + my_function->releaseGPUBuffer_deleteCPUBuffer(); + } - if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) - { - //std::cout << timer << std::endl; - timer.pruneOutliers( 3.0 ); - std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl; - std::cout << "BLAS (round trip) execution Gflops < " << - my_function->gflops_formula() << " >: " << my_function->gflops() << - std::endl; - } - } - if(roundtrip=="noroundtrip"||roundtrip=="both") - { - timer.Reset(); - my_function->setup_buffer( order_option, side_option, uplo_option, - diag_option, transA_option, transB_option, + if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) + { + //std::cout << timer << std::endl; + timer.pruneOutliers( 3.0 ); + std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl; + std::cout << "BLAS (round trip) execution Gflops < " << + my_function->gflops_formula() << " >: " << my_function->gflops() << + std::endl; + } + } + if(roundtrip=="noroundtrip"||roundtrip=="both") + { + timer.Reset(); + my_function->setup_buffer( order_option, side_option, uplo_option, + diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); - my_function->initialize_cpu_buffer(); - my_function->initialize_gpu_buffer(); - my_function->setup_apiCallCount( apiCallCount ); + my_function->initialize_cpu_buffer(); + my_function->initialize_gpu_buffer(); + my_function->setup_apiCallCount( apiCallCount ); + + for 
(cl_uint i = 0; i < profileCount; ++i) - { - my_function->call_func(); - } - my_function->read_gpu_buffer(); - //my_function->reset_gpu_write_buffer(); - my_function->releaseGPUBuffer_deleteCPUBuffer(); + { + my_function->call_func(); + } + my_function->read_gpu_buffer(); - if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) - { - //std::cout << timer << std::endl; - timer.pruneOutliers( 3.0 ); - std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl; - std::cout << "BLAS kernel execution Gflops < " << - my_function->gflops_formula() << " >: " << my_function->gflops() << - std::endl; - } - } - delete my_function; - return 0; -} + my_function->validate_with_cblas(validate); + + //my_function->reset_gpu_write_buffer(); + my_function->releaseGPUBuffer_deleteCPUBuffer(); + if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) + { + //std::cout << timer << std::endl; + timer.pruneOutliers( 3.0 ); + std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl; + std::cout << "BLAS kernel execution Gflops < " << + my_function->gflops_formula() << " >: " << my_function->gflops() << + std::endl; + } + } + delete my_function; + return 0; +} From 80288680f069b93cb612e1e473b0dd06712f21f9 Mon Sep 17 00:00:00 2001 From: Abhishek Shandilya Date: Sat, 2 Jul 2016 00:26:02 +0530 Subject: [PATCH 37/45] fix #265 - spelling errors in comments and print statements (#276) * fix #265 - spelling errors in comments and print statements * correct the argument order in doxygen comments --- src/clBLAS.h | 2 +- src/include/kerngen.h | 2 +- src/library/blas/generic/solution_seq_make.c | 2 +- src/library/blas/gens/clTemplates/gemm.cl | 8 ++--- src/library/blas/gens/clTemplates/symm.cl | 36 ++++++++++---------- src/library/blas/gens/clTemplates/trmv.cl | 4 +-- src/library/blas/gens/kprintf.cpp | 4 +-- src/library/blas/gens/trsv_gemv.cpp | 2 +- src/library/blas/include/kprintf.hpp | 2 +- 
src/library/blas/xtrsv.c | 4 +-- src/library/common/kerngen_core.c | 2 +- 11 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/clBLAS.h b/src/clBLAS.h index fef80a42..13ac6e9b 100644 --- a/src/clBLAS.h +++ b/src/clBLAS.h @@ -7809,9 +7809,9 @@ clblasDtrmm( * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. + * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. - * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrmm(). * @param[out] B Buffer object storing matrix \b B. diff --git a/src/include/kerngen.h b/src/include/kerngen.h index 73ee1912..fc39a9c2 100644 --- a/src/include/kerngen.h +++ b/src/include/kerngen.h @@ -244,7 +244,7 @@ kgenSyncFormatting( * * @return 0 on success; -1 if the source code exceeds the buffer, * or level of the code nesting is not zero, or the returned - * type is not defined, or there is not a paranthesis opening + * type is not defined, or there is not a parenthesis opening * the argument list */ int diff --git a/src/library/blas/generic/solution_seq_make.c b/src/library/blas/generic/solution_seq_make.c index ab64869a..758bc78d 100644 --- a/src/library/blas/generic/solution_seq_make.c +++ b/src/library/blas/generic/solution_seq_make.c @@ -214,7 +214,7 @@ selectVectorization( } // - // Routines that dont use LDS have to be below the isLdsUsed() code + // Routines that don't use LDS have to be below the isLdsUsed() code // if (step->funcID == CLBLAS_GEMM2) { diff --git a/src/library/blas/gens/clTemplates/gemm.cl b/src/library/blas/gens/clTemplates/gemm.cl index 26f05267..70fe092d 100644 --- a/src/library/blas/gens/clTemplates/gemm.cl +++ b/src/library/blas/gens/clTemplates/gemm.cl @@ -98,7 +98,7 @@ __kernel void GEMM_NN__KERNEL ( __global 
%TYPE const * restrict _A, __global %TY // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. - // We dont use %PANEL in the current implementation. + // We don't use %PANEL in the current implementation. // MV = M; #ifndef TAIL_RUN @@ -118,7 +118,7 @@ __kernel void GEMM_NN__KERNEL ( __global %TYPE const * restrict _A, __global %TY bidX = ( get_group_id(0) / ( blockDimY)); // // Note: - // Using the new Map function does not yeild any performnce gain. + // Using the new Map function does not yield any performance gain. // In fact, it degraded the performance // Keep this commented. // @@ -613,7 +613,7 @@ __kernel void GEMM_NT__KERNEL ( __global %TYPE const * restrict _A, __global %TY // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. - // We dont use %PANEL in the current implementation. + // We don't use %PANEL in the current implementation. // MV = M; NV = N; @@ -1141,7 +1141,7 @@ __kernel void GEMM_TN__KERNEL ( __global %TYPE const * restrict _A, __global %TY // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. - // We dont use %PANEL in the current implementation. + // We don't use %PANEL in the current implementation.
// MV = M; #ifndef TAIL_RUN diff --git a/src/library/blas/gens/clTemplates/symm.cl b/src/library/blas/gens/clTemplates/symm.cl index 597fa8bb..75849182 100644 --- a/src/library/blas/gens/clTemplates/symm.cl +++ b/src/library/blas/gens/clTemplates/symm.cl @@ -176,18 +176,18 @@ const char *SYMM_C_KERNEL= " return SYMM_VECTOR_LOAD(A, M, lda, row, col); } #ifdef __SYMM_LOWER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_FIRST(A,M,K,lda,row,col) %VLOAD(0, (&A[(col)*lda + (row)])) #elif defined(__SYMM_UPPER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_FIRST(A,M,K,lda,row,col) %VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda) #endif #define LOADA_SECOND(A,M,K,lda,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, row, col) #ifdef __SYMM_LOWER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_THIRD(A,M,K,lda,row, col) %VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda) #elif defined(__SYMM_UPPER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_THIRD(A,M,K,lda,row, col) %VLOAD(0, (&A[(col)*lda + (row)])) #endif #define LOADA_TAIL(A,M,K,lda,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(A,M,lda,row,col) @@ -217,18 +217,18 @@ const char *SYMM_C_KERNEL= " return SYMM_VECTOR_LOAD(B, N, ldb, row, col); } #ifdef __SYMM_UPPER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour 
with so many parentheses - If fails, use parentheses in the caller #define LOADB_FIRST(B,K,N,ldb,row,col) %VLOAD(0, (&B[(col)*(ldb) + (row)])) #elif defined(__SYMM_LOWER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADB_FIRST(B,K,N,ldb,row,col) %VLOADWITHINCXV2(0, (&B[(row)*(ldb) + (col)]), ldb) #endif #define LOADB_SECOND(B,K,N,ldb,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, row, col) #ifdef __SYMM_UPPER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADB_THIRD(B,K,N,ldb,row,col) %VLOADWITHINCXV2(0, (&B[(row)*(ldb) + (col)]), ldb) #elif defined(__SYMM_LOWER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADB_THIRD(B,K,N,ldb,row,col) %VLOAD(0, (&B[(col)*(ldb) + (row)])) #endif #define LOADB_TAIL(B,K,N,ldb,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(B, N,ldb,row,col) @@ -288,7 +288,7 @@ const char *SYMM_C_KERNEL= " // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. - // We dont use %PANEL in the current implementation. + // We don't use %PANEL in the current implementation. 
// blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1; bidY = ( get_group_id(0) % ( blockDimY)); @@ -673,18 +673,18 @@ const char *SYMM_C_KERNEL_WORKING_EXCEPT_CSYMM_PROBLEM = " return SYMM_VECTOR_LOAD(A, M, lda, row, col); } #ifdef __SYMM_LOWER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_FIRST(A,M,K,lda,row,col) %VLOAD(0, (&A[(col)*lda + (row)])) #elif defined(__SYMM_UPPER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_FIRST(A,M,K,lda,row,col) %VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda) #endif #define LOADA_SECOND(A,M,K,lda,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, row, col) #ifdef __SYMM_LOWER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_THIRD(A,M,K,lda,row, col) %VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda) #elif defined(__SYMM_UPPER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADA_THIRD(A,M,K,lda,row, col) %VLOAD(0, (&A[(col)*lda + (row)])) #endif #define LOADA_TAIL(A,M,K,lda,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(A,M,lda,row,col) @@ -714,18 +714,18 @@ const char *SYMM_C_KERNEL_WORKING_EXCEPT_CSYMM_PROBLEM = " return SYMM_VECTOR_LOAD(B, N, ldb, row, col); } #ifdef __SYMM_UPPER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define 
LOADB_FIRST(B,K,N,ldb,row,col) %VLOAD(0, (&B[(col)*(ldb) + (row)])) #elif defined(__SYMM_LOWER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADB_FIRST(B,K,N,ldb,row,col) %VLOADWITHINCXV2(0, (&B[(row)*(ldb) + (col)]), ldb) #endif #define LOADB_SECOND(B,K,N,ldb,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, row, col) #ifdef __SYMM_UPPER__ - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADB_THIRD(B,K,N,ldb,row,col) %VLOADWITHINCXV2(0, (&B[(row)*(ldb) + (col)]), ldb) #elif defined(__SYMM_LOWER__) - // CHECK: KPRINTF Behaviour with so many parantheses - If fails, use parantheses in the caller + // CHECK: KPRINTF Behaviour with so many parentheses - If fails, use parentheses in the caller #define LOADB_THIRD(B,K,N,ldb,row,col) %VLOAD(0, (&B[(col)*(ldb) + (row)])) #endif #define LOADB_TAIL(B,K,N,ldb,row,col) SYMM_VECTOR_LOAD_USING_SCALAR(B, N,ldb,row,col) @@ -783,7 +783,7 @@ const char *SYMM_C_KERNEL_WORKING_EXCEPT_CSYMM_PROBLEM = " // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. - // We dont use %PANEL in the current implementation. + // We don't use %PANEL in the current implementation. 
// blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1; bidY = ( get_group_id(0) % ( blockDimY)); diff --git a/src/library/blas/gens/clTemplates/trmv.cl b/src/library/blas/gens/clTemplates/trmv.cl index 21af6ce5..807ae69a 100644 --- a/src/library/blas/gens/clTemplates/trmv.cl +++ b/src/library/blas/gens/clTemplates/trmv.cl @@ -170,7 +170,7 @@ __kernel void %PREFIXtrmv_CU_kernel( __global %TYPE const* restrict _A, __global // // Only TARGET_WIDTH threads points are to be read from X-vector - // We dont't use VLOAD here because incx could be > 1 + // We don't use VLOAD here because incx could be > 1 // Minimal prototyping shows that having separate loading code // for incx value of 1 does not change anything in performance // In fact, the extra IF costs us. @@ -421,7 +421,7 @@ __kernel void %PREFIXtrmv_CL_kernel( __global %TYPE const* restrict _A, __global // // Only TARGET_WIDTH threads points are to be read from X-vector - // We dont't use VLOAD here because incx could be > 1 + // We don't use VLOAD here because incx could be > 1 // Minimal prototyping shows that having separate loading code // for incx value of 1 does not change anything in performance // In fact, the extra IF costs us. diff --git a/src/library/blas/gens/kprintf.cpp b/src/library/blas/gens/kprintf.cpp index d5cbecb8..35e448cb 100644 --- a/src/library/blas/gens/kprintf.cpp +++ b/src/library/blas/gens/kprintf.cpp @@ -854,7 +854,7 @@ void kprintf::handleComplexJoin(char **_src, char **_dst) { case SCALAR: // - // Dont do a thing...ComplexJoin not applicable for Real numbers + // Don't do a thing...ComplexJoin not applicable for Real numbers // break; @@ -1547,7 +1547,7 @@ void kprintf::handleReduceSumReal(char **_src, char **_dst, int vlength) if (!vlength) //Can happen for SCALAR cases where source code contains this within COMPLEX define { // - // Dont generate a thing. + // Don't generate a thing. // The src pointer has already been advanced to next line // Just move on.. 
// diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp index 65121a1c..0d1b41d0 100644 --- a/src/library/blas/gens/trsv_gemv.cpp +++ b/src/library/blas/gens/trsv_gemv.cpp @@ -278,7 +278,7 @@ static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen /* * NOTE: * No-Transpose case - The code iterates along the X direction. Vectoring is along Y Direction. - * Since we dont iterate on Y direction (triangle height), this fixes the "blocky" component of the blocksize. + * Since we don't iterate on Y direction (triangle height), this fixes the "blocky" component of the blocksize. * The blockSize then determines how much width the block has on X direction and thus the number of loops * can be calculated from that information. */ diff --git a/src/library/blas/include/kprintf.hpp b/src/library/blas/include/kprintf.hpp index e2eb366a..12cdfc4d 100644 --- a/src/library/blas/include/kprintf.hpp +++ b/src/library/blas/include/kprintf.hpp @@ -77,7 +77,7 @@ class kprintf { void registerType(const char *baseType, int vecWidth, int internalVecWidth=1); void registerReducedTypes( const char* in, int div); void registerSuperTypes( const char* in, int mul); - char* mystrtok( char* in, const char* tok); //NOTE: strtok overwrites the string. we dont like that... + char* mystrtok( char* in, const char* tok); //NOTE: strtok overwrites the string. we don't like that... // // VLOAD %TYPE%V from (%PTYPE*) kind of memory locations // The Kernel writers should use "%TYPE" and "%TYPE%V" for kernel aguments, local variables etc.. diff --git a/src/library/blas/xtrsv.c b/src/library/blas/xtrsv.c index c75dadb6..13573b34 100644 --- a/src/library/blas/xtrsv.c +++ b/src/library/blas/xtrsv.c @@ -60,7 +60,7 @@ orchestrateNonTransposeTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *ge if ((trtri->subdims->y) != (gemv->subdims->y)) { - printf("TRSV: WARNING: TRTRI and GEMV dont have identical sub-divisions!!! 
%lu and %lu\n", trtri->subdims->y, gemv->subdims->y); + printf("TRSV: WARNING: TRTRI and GEMV don't have identical sub-divisions!!! %lu and %lu\n", trtri->subdims->y, gemv->subdims->y); return clblasNotImplemented; } else { #ifdef DEBUG_TRSV @@ -166,7 +166,7 @@ orchestrateTransposeTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvS if ((trtri->subdims->y) != (gemv->subdims->y)) { - printf("TRSV: Transpose: WARNING: TRTRI and GEMV dont have identical sub-divisions!!! %lu and %lu\n", trtri->subdims->y, gemv->subdims->y); + printf("TRSV: Transpose: WARNING: TRTRI and GEMV don't have identical sub-divisions!!! %lu and %lu\n", trtri->subdims->y, gemv->subdims->y); return clblasNotImplemented; } else { #ifdef DEBUG_TRSV diff --git a/src/library/common/kerngen_core.c b/src/library/common/kerngen_core.c index 7db25b60..2d467fff 100644 --- a/src/library/common/kerngen_core.c +++ b/src/library/common/kerngen_core.c @@ -110,7 +110,7 @@ searchFuncName(const char *source, size_t *len) char *name = NULL; /* - * Search the opening paranthesis. The word before it is + * Search the opening parenthesis. The word before it is * the function name */ sep = strchr(source, '('); From c464ab973cbb6747337a09fdf11672cfc7a9340e Mon Sep 17 00:00:00 2001 From: Ivan Vergiliev Date: Tue, 2 Aug 2016 12:53:38 +0300 Subject: [PATCH 38/45] Disable clang error on narrowing conversions. --- src/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 73ba594e..0ef7850a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -319,6 +319,8 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(TARGET_PLATFORM EQUAL 32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin") endif() +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") elseif( MSVC ) # CMake sets huge stack frames for windows, for whatever reason. We go with compiler default. 
string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" ) From 1775a50876eb62f4b86ebceb233ed0ddda0d455b Mon Sep 17 00:00:00 2001 From: Ivan Vergiliev Date: Tue, 2 Aug 2016 10:53:54 +0300 Subject: [PATCH 39/45] Point the CONTRIBUTING wiki links to the correct repository --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0dc5c7e8..195b009e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,9 +19,9 @@ We want to ensure that the project code base maintains a level of quality over t guidelines over time * separate check-ins that modify a files style from the ones that add/change/delete code. * target the **develop** branch in the repository -* ensure that the [code properly builds]( https://github.com/kknox/clBLAS/wiki/Build ) +* ensure that the [code properly builds]( https://github.com/clMathLibraries/clBLAS/wiki/Build ) * cannot break existing test cases - * we encourage contributors to [run the test-short]( https://github.com/kknox/clBLAS/wiki/Testing ) suite of tests on their end before the pull-request + * we encourage contributors to [run the test-short]( https://github.com/clMathLibraries/clBLAS/wiki/Testing ) suite of tests on their end before the pull-request * if possible, upload the test results associated with the pull request to a personal [gist repository]( https://gist.github.com/ ) and insert a link to the test results in the pull request so that collaborators can browse the results * if no test results are provided with the pull request, official collaborators will run the test suite on their test machines against the patch before we will accept the pull-request * if we detect failing test cases, we will request that the code associated with the pull request be fixed before the pull request will be merged From 0a8a4fa4bb048df7041996f236550768ea387bd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Carretero?= Date: Sun, 
24 Jul 2016 14:31:36 -0400 Subject: [PATCH 40/45] add missing dependency to pthread (using rwlock functions) --- src/CMakeLists.txt | 1 + src/library/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 73ba594e..ce2dd198 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -172,6 +172,7 @@ endif( ) # add the math library for Linux if( UNIX ) set(MATH_LIBRARY "m") + set(THREAD_LIBRARY "pthread") endif() # set the path to specific OpenCL compiler diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt index 774f05b6..51648985 100644 --- a/src/library/CMakeLists.txt +++ b/src/library/CMakeLists.txt @@ -887,7 +887,7 @@ endif() set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION}) set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION}) set_target_properties( clBLAS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) -target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY}) +target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY} ${THREAD_LIBRARY}) # CPack configuration; include the executable into the package install( TARGETS clBLAS From 0e0c95c2b686d895118c0a7f3e4ca2cfd2c5d422 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Fri, 7 Oct 2016 16:49:55 -0400 Subject: [PATCH 41/45] x offset stored in offb, not offa, determines vectorization --- src/library/blas/gens/iamax.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/library/blas/gens/iamax.cpp b/src/library/blas/gens/iamax.cpp index 13928a8c..0062943d 100644 --- a/src/library/blas/gens/iamax.cpp +++ b/src/library/blas/gens/iamax.cpp @@ -108,7 +108,7 @@ selectVectorization( KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; - if( (((kargs->offa) % vlen) != 0)) + if( (((kargs->offb) % vlen) != 0)) { kflags = KEXTRA_NO_COPY_VEC_A; } From 53d25ef60399e95783f1e07ec73961c014f18efb Mon Sep 17 00:00:00 2001 
From: Mark Gates Date: Fri, 7 Oct 2016 16:58:15 -0400 Subject: [PATCH 42/45] syr2: Y uses incy, not incx --- src/library/blas/gens/clTemplates/her2.cl | 8 ++++---- src/library/blas/gens/clTemplates/syr2.cl | 6 +++--- src/library/blas/gens/clTemplates/syr2_her2.cl | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/library/blas/gens/clTemplates/her2.cl b/src/library/blas/gens/clTemplates/her2.cl index ae8f92df..a0520987 100644 --- a/src/library/blas/gens/clTemplates/her2.cl +++ b/src/library/blas/gens/clTemplates/her2.cl @@ -138,8 +138,8 @@ __kernel void %PREFIXher2_CL_kernel( __global const %TYPE* _A, __global const %T %CONJUGATE(1, res1); #endif %MUL( res5, alpha, res1 ); - res1 = Y[c * incx]; - res3 = Y[r * incx]; + res1 = Y[c * incy]; + res3 = Y[r * incy]; #ifndef HER2_ROWMAJOR %CONJUGATE(1, res1); #endif @@ -461,8 +461,8 @@ __kernel void %PREFIXher2_CU_kernel( __global const %TYPE* _A, __global const %T %CONJUGATE(1, res1); #endif %MUL( res5, alpha, res1 ); - res1 = Y[c * incx]; - res3 = Y[r * incx]; + res1 = Y[c * incy]; + res3 = Y[r * incy]; #ifndef HER2_ROWMAJOR %CONJUGATE(1, res1); #endif diff --git a/src/library/blas/gens/clTemplates/syr2.cl b/src/library/blas/gens/clTemplates/syr2.cl index 0846b2d8..4f86b2a3 100644 --- a/src/library/blas/gens/clTemplates/syr2.cl +++ b/src/library/blas/gens/clTemplates/syr2.cl @@ -128,9 +128,9 @@ __kernel void %PREFIXsyr2_CL_kernel( __global const %TYPE* _A, __global const %T %TYPE res1, res2; res1 = alpha * X[c * incx]; - res2 = alpha * X[r * incx]; - res1 = res1 * Y[r * incx]; - res2 = res2 * Y[c * incx]; + res2 = alpha * X[r * incx]; + res1 = res1 * Y[r * incy]; + res2 = res2 * Y[c * incy]; A[r + c * lda] += (res1 + res2); } diff --git a/src/library/blas/gens/clTemplates/syr2_her2.cl b/src/library/blas/gens/clTemplates/syr2_her2.cl index b335e8cd..fcb32801 100644 --- a/src/library/blas/gens/clTemplates/syr2_her2.cl +++ b/src/library/blas/gens/clTemplates/syr2_her2.cl @@ -148,8 +148,8 @@ 
__kernel void %PREFIXsyr2_her2_CL_kernel( __global %TYPE* _A, __global const %TY #endif #endif %MUL( res5, alpha, res1 ); - res1 = Y[c * incx]; - res3 = Y[r * incx]; + res1 = Y[c * incy]; + res3 = Y[r * incy]; #ifdef HER2_ONLY #ifndef HER2_ROWMAJOR %CONJUGATE(1, res1); @@ -514,8 +514,8 @@ __kernel void %PREFIXsyr2_her2_CU_kernel( __global %TYPE* _A, __global const %TY #endif #endif %MUL( res5, alpha, res1 ); - res1 = Y[c * incx]; - res3 = Y[r * incx]; + res1 = Y[c * incy]; + res3 = Y[r * incy]; #ifdef HER2_ONLY #ifndef HER2_ROWMAJOR %CONJUGATE(1, res1); From 69d38d947229e57668be51a40f9b3cd755d7320f Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Mon, 16 Jan 2017 12:11:29 -0600 Subject: [PATCH 43/45] Adding additional trsm samples --- src/samples/CMakeLists.txt | 34 +++--- src/samples/example_ctrsm.c | 177 ++++++++++++++++++++++++++++++++ src/samples/example_strsm.cpp | 188 ++++++++++++++++++++++++++++++++++ src/tests/BlasBase.cpp | 5 +- src/tests/cmdline.c | 3 +- 5 files changed, 392 insertions(+), 15 deletions(-) create mode 100644 src/samples/example_ctrsm.c create mode 100644 src/samples/example_strsm.cpp diff --git a/src/samples/CMakeLists.txt b/src/samples/CMakeLists.txt index 8422e654..53ed2fb8 100644 --- a/src/samples/CMakeLists.txt +++ b/src/samples/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,6 +19,8 @@ set(SSYMV_SAMPLE_SRC example_ssymv.c) set(SGEMM_SAMPLE_SRC example_sgemm.c) set(STRMM_SAMPLE_SRC example_strmm.c) set(STRSM_SAMPLE_SRC example_strsm.c) +set(STRSM_SAMPLE_SRCPP example_strsm.cpp) +set(CTRSM_SAMPLE_SRC example_ctrsm.c) set(SSYRK_SAMPLE_SRC example_ssyrk.c) set(SSYR2K_SAMPLE_SRC example_ssyr2k.c) @@ -91,6 +93,14 @@ add_executable(example_strsm ${STRSM_SAMPLE_SRC}) target_link_libraries(example_strsm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_strsm PROPERTY FOLDER "Samples") +add_executable(example_strsm_cpp ${STRSM_SAMPLE_SRCPP}) +target_link_libraries(example_strsm_cpp ${OPENCL_LIBRARIES} clBLAS) +set_property( TARGET example_strsm_cpp PROPERTY FOLDER "Samples") + +add_executable(example_ctrsm ${CTRSM_SAMPLE_SRC}) +target_link_libraries(example_ctrsm ${OPENCL_LIBRARIES} clBLAS) +set_property( TARGET example_ctrsm PROPERTY FOLDER "Samples") + add_executable(example_ssyrk ${SSYRK_SAMPLE_SRC}) target_link_libraries(example_ssyrk ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssyrk PROPERTY FOLDER "Samples") @@ -267,11 +277,11 @@ else( ) set( CLBLAS_EXAMPLE_INSTALL_DESTINATION share/clBLAS/samples) endif() install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk - example_ssyr2k example_strmm example_strsm - example_strmv example_strsv example_sger example_cher example_ssyr + example_ssyr2k example_strmm example_strsm + example_strmv example_strsv example_sger example_cher example_ssyr example_ssyr2 example_cherk example_ssymm example_chemm example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr - example_sspr2 example_zhpr2 + example_sspr2 example_zhpr2 example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv example_cher2k example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot @@ -286,16 +296,16 @@ install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk configure_file( 
"${PROJECT_SOURCE_DIR}/samples/CMakeLists.pack" "${PROJECT_BINARY_DIR}/samples/CMakeLists.txt" COPYONLY ) - + if( WIN32 ) set( CLBLAS_SAMPLE_INSTALL_DESTINATION samples) else( ) set( CLBLAS_SAMPLE_INSTALL_DESTINATION share/clBLAS/samples/src) endif() - + install(FILES example_sgemv.c - example_ssymv.c + example_ssymv.c example_sgemm.c example_strmm.c example_strsm.c @@ -303,11 +313,11 @@ install(FILES example_ssyr2k.c example_strmv.c example_strsv.c - example_sger.c - example_ssyr.c + example_sger.c + example_ssyr.c example_ssyr2.c example_ssymm.c - example_cher.c + example_cher.c example_chemm.cpp example_cherk.cpp example_ssymm.c diff --git a/src/samples/example_ctrsm.c b/src/samples/example_ctrsm.c new file mode 100644 index 00000000..44664810 --- /dev/null +++ b/src/samples/example_ctrsm.c @@ -0,0 +1,177 @@ +/* ************************************************************************ + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ************************************************************************/ + + +#include +#include +#include + +/* Include CLBLAS header. It automatically includes needed OpenCL header, + * so we can drop out explicit inclusion of cl.h header. + */ +#include + +/* This example uses predefined matrices and their characteristics for + * simplicity purpose. 
+ */ +static const clblasOrder order = clblasRowMajor; +static const clblasSide side = clblasLeft; + +static const size_t M = 4; +static const size_t N = 5; + +static const FloatComplex alpha = { 10, 0 }; + +static const clblasTranspose transA = clblasNoTrans; +static const clblasUplo uploA = clblasUpper; +static const clblasDiag diagA = clblasNonUnit; +static const FloatComplex A[] = { + { 11, 0 },{ 12, 0 },{ 13, 0 },{ 14, 0 }, + { 0, 0 },{ 22, 0 },{ 23, 0 },{ 24, 0 }, + { 0, 0 },{ 0, 0 },{ 33, 0 },{ 34, 0 }, + { 0, 0 },{ 0, 0 },{ 0, 0 },{ 44, 0 } +}; +static const size_t lda = 4; /* i.e. lda = M */ + +static FloatComplex B[] = { + { 11, 0 },{ 12, 0 },{ 13, 0 },{ 14, 0 },{ 15, 0 }, + { 21, 0 },{ 22, 0 },{ 23, 0 },{ 24, 0 },{ 25, 0 }, + { 31, 0 },{ 32, 0 },{ 33, 0 },{ 34, 0 },{ 35, 0 }, + { 41, 0 },{ 42, 0 },{ 43, 0 },{ 44, 0 },{ 45, 0 }, +}; +static const size_t ldb = 5; /* i.e. ldb = N */ + + +static FloatComplex result[20]; /* ldb*M */ + +static const size_t off = 1; +static const size_t offA = 4 + 1; /* M + off */ +static const size_t offB = 5 + 1; /* N + off */ + +static void +printResult(const char* str) +{ + size_t i, j, nrows; + + printf("%s:\n", str); + + nrows = (sizeof(result) / sizeof(FloatComplex)) / ldb; + for (i = 0; i < nrows; i++) { + for (j = 0; j < ldb; j++) { + printf("%.5f ", result[i * ldb + j].x); + } + printf("\n"); + } +} + +int +main(void) +{ + cl_int err; + cl_platform_id platform[] = { 0, 0 }; + cl_device_id device = 0; + cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; + cl_context ctx = 0; + cl_command_queue queue = 0; + cl_mem bufA, bufB; + cl_event event = NULL; + int ret = 0; + + /* Setup OpenCL environment. 
*/ + err = clGetPlatformIDs(sizeof( platform ), &platform, NULL); + if (err != CL_SUCCESS) { + printf( "clGetPlatformIDs() failed with %d\n", err ); + return 1; + } + + err = clGetDeviceIDs(platform[0], CL_DEVICE_TYPE_CPU, 1, &device, NULL); + if (err != CL_SUCCESS) { + printf( "clGetDeviceIDs() failed with %d\n", err ); + return 1; + } + + props[1] = (cl_context_properties)platform; + ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); + if (err != CL_SUCCESS) { + printf( "clCreateContext() failed with %d\n", err ); + return 1; + } + + queue = clCreateCommandQueue(ctx, device, 0, &err); + if (err != CL_SUCCESS) { + printf( "clCreateCommandQueue() failed with %d\n", err ); + clReleaseContext(ctx); + return 1; + } + + /* Setup clblas. */ + err = clblasSetup(); + if (err != CL_SUCCESS) { + printf("clblasSetup() failed with %d\n", err); + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + return 1; + } + + /* Prepare OpenCL memory objects and place matrices inside them. */ + bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A), + NULL, &err); + bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*B), + NULL, &err); + + err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, + M * M * sizeof(*A), A, 0, NULL, NULL); + err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, + M * N * sizeof(*B), B, 0, NULL, NULL); + + /* Call clblas function. Perform TRSM for the lower right sub-matrices */ + err = clblasCtrsm(order, side, uploA, transA, diagA, M - off, N - off, + alpha, bufA, offA, lda, bufB, offB, ldb, 1, &queue, 0, + NULL, &event); + if (err != CL_SUCCESS) { + printf("clblasStrsmEx() failed with %d\n", err); + ret = 1; + } + else { + /* Wait for calculations to be finished. */ + err = clWaitForEvents(1, &event); + + /* Fetch results of calculations from GPU memory. 
*/ + err = clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0, + M * N * sizeof(*result), + result, 0, NULL, NULL); + + /* At this point you will get the result of STRSM placed in 'result' array. */ + puts(""); + printResult("clblasCtrsmEx result"); + } + + /* Release OpenCL events. */ + clReleaseEvent(event); + + /* Release OpenCL memory objects. */ + clReleaseMemObject(bufB); + clReleaseMemObject(bufA); + + /* Finalize work with clblas. */ + clblasTeardown(); + + /* Release OpenCL working objects. */ + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + + return ret; +} diff --git a/src/samples/example_strsm.cpp b/src/samples/example_strsm.cpp new file mode 100644 index 00000000..da515147 --- /dev/null +++ b/src/samples/example_strsm.cpp @@ -0,0 +1,188 @@ +/* ************************************************************************ + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ************************************************************************/ + + +#include +#include +#include + +/* Include CLBLAS header. It automatically includes needed OpenCL header, + * so we can drop out explicit inclusion of cl.h header. + */ +#include + +/* This example uses predefined matrices and their characteristics for + * simplicity purpose. 
+ */ +static const clblasOrder order = clblasColumnMajor; +static const clblasSide side = clblasLeft; +static const clblasTranspose transA = clblasNoTrans; +static const clblasUplo uploA = clblasUpper; +static const clblasDiag diagA = clblasNonUnit; + +static const cl_float alpha = 10; +static const size_t M = 64; +static const size_t N = 64; +static const size_t lda = M; /* i.e. lda = M */ +static const size_t ldb = N; /* i.e. ldb = N */ + +static cl_float A[lda * M]; +static cl_float B[ldb * N]; +static cl_float result[ldb * N]; /* ldb*N */ + +static const size_t off = 0; +static const size_t offA = 0; /* M + off */ +static const size_t offB = 0; /* N + off */ + +static void +makeScaledIdentity(cl_float* matx, size_t M, size_t N, float scale ) +{ + for( size_t i = 0; i < M; ++i ) + for (size_t j = 0; j < N; ++j) + { + matx[i * M + j] = 0.0f; + if( i == j ) + matx[i * M + j] = 1.0f * scale; + } + +} + +static void +printResult(const char* str) +{ + size_t i, j, nrows; + + printf("%s:\n", str); + + nrows = (sizeof(result) / sizeof(cl_float)) / ldb; + for (i = 0; i < nrows; i++) { + for (j = 0; j < ldb; j++) { + printf("%.5e ", result[i * ldb + j]); + } + printf("\n"); + } +} + +int +main(void) +{ + cl_int err; + // Increase platforms array for system needs; 2 covers most situations + cl_platform_id platforms[] = { 0,0 }; + cl_device_id device = 0; + cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; + cl_context ctx = 0; + cl_command_queue queue = 0; + cl_mem bufA, bufB; + cl_event event = NULL; + int ret = 0; + + makeScaledIdentity( A, M, N, 1.0f ); + makeScaledIdentity( B, M, N, 1.0f); + makeScaledIdentity( result, M, N, 0.0f); + + /* Setup OpenCL environment. 
*/ + err = clGetPlatformIDs( sizeof( platforms )/ sizeof( cl_platform_id ), &platforms[0], NULL); + if (err != CL_SUCCESS) { + printf( "clGetPlatformIDs() failed with %d\n", err ); + return 1; + } + + // Change this statement to pick the desired platform under test + cl_platform_id test_platform = platforms[1]; + + //!!! Change device type to validate; works on GPU, faults on CPU + err = clGetDeviceIDs(test_platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); + if (err != CL_SUCCESS) { + printf( "clGetDeviceIDs() failed with %d\n", err ); + return 1; + } + + props[1] = (cl_context_properties)test_platform; + ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); + if (err != CL_SUCCESS) { + printf( "clCreateContext() failed with %d\n", err ); + return 1; + } + + queue = clCreateCommandQueue(ctx, device, 0, &err); + if (err != CL_SUCCESS) { + printf( "clCreateCommandQueue() failed with %d\n", err ); + clReleaseContext(ctx); + return 1; + } + + /* Setup clblas. */ + err = clblasSetup(); + if (err != CL_SUCCESS) { + printf("clblasSetup() failed with %d\n", err); + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + return 1; + } + + /* Prepare OpenCL memory objects and place matrices inside them. */ + bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, lda * M * sizeof(*A), + NULL, &err); + bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, ldb * N * sizeof(*B), + NULL, &err); + + err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, + lda * M * sizeof(*A), A, 0, NULL, NULL); + err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, + ldb * N * sizeof(*B), B, 0, NULL, NULL); + + /* Call clblas function. 
Perform TRSM for the lower right sub-matrices */ + // A is identity matrix + // B is identity matrix + // Solving for identity matrices should yield an identity matrix scaled by alpha + err = clblasStrsm(order, side, uploA, transA, diagA, M - off, N - off, + alpha, bufA, offA, lda, bufB, offB, ldb, 1, &queue, 0, + NULL, &event); + if (err != CL_SUCCESS) { + printf("clblasStrsmEx() failed with %d\n", err); + ret = 1; + } + else { + /* Wait for calculations to be finished. */ + err = clWaitForEvents(1, &event); + + /* Fetch results of calculations from opencl memory. */ + err = clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0, + ldb * N * sizeof(*result), + result, 0, NULL, NULL); + + // At this point, 'result' should contain a scaled identity matrix + puts(""); + printResult("clblasStrsm result"); + } + + /* Release OpenCL events. */ + clReleaseEvent(event); + + /* Release OpenCL memory objects. */ + clReleaseMemObject(bufB); + clReleaseMemObject(bufA); + + /* Finalize work with clblas. */ + clblasTeardown(); + + /* Release OpenCL working objects. 
*/ + clReleaseCommandQueue(queue); + clReleaseContext(ctx); + + return ret; +} diff --git a/src/tests/BlasBase.cpp b/src/tests/BlasBase.cpp index 73a6f5e2..1bcc5d40 100644 --- a/src/tests/BlasBase.cpp +++ b/src/tests/BlasBase.cpp @@ -105,10 +105,11 @@ BlasBase::getDevice(cl_device_type type, const char* name, { cl_int err; cl_uint nrDevices, i, p; - cl_device_id *devices, result = NULL; + cl_device_id *devices = NULL; + cl_device_id result = 0; size_t sz; char *str; - cl_platform_id *platforms, selPlatform = NULL; + cl_platform_id* platforms = NULL; cl_uint nrPlatforms; nrPlatforms = getPlatforms(&platforms, &err); diff --git a/src/tests/cmdline.c b/src/tests/cmdline.c index c9519cc6..235367da 100644 --- a/src/tests/cmdline.c +++ b/src/tests/cmdline.c @@ -18,6 +18,7 @@ #include /* strcmp */ #include /* atoi, strtol */ #include /* printf */ +#include #include @@ -92,7 +93,7 @@ doParseCmdLine( currArg = (const char*)argv[i]; i++; - if (currArg[0] != '-') { + if ( (currArg[0] != '-') && isdigit( currArg[0] ) ){ // some of size arguments switch (j) { case 0: From a71aa63e4006884030329d680cdaee189d75f4a3 Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Mon, 16 Jan 2017 13:55:49 -0600 Subject: [PATCH 44/45] Bump version to 2.12.0 --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f38275fc..6a88c410 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -108,7 +108,7 @@ if( NOT DEFINED clBLAS_VERSION_MAJOR ) endif( ) if( NOT DEFINED clBLAS_VERSION_MINOR ) - set( clBLAS_VERSION_MINOR 11 ) + set( clBLAS_VERSION_MINOR 12 ) endif( ) if( NOT DEFINED clBLAS_VERSION_PATCH ) From b567cd4fa47b362a23939d26235b32466b8e7aed Mon Sep 17 00:00:00 2001 From: Kent Knox Date: Tue, 17 Jan 2017 14:07:46 -0600 Subject: [PATCH 45/45] Update README with release notes --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8de7d7ec..8fc9492d 
100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate -debugging and multicore programming. APPML 1.10 is the most current +debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. @@ -23,13 +23,12 @@ library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. -## clBLAS update notes 09/2015 - -- Introducing [AutoGemm](http://github.com/clMathLibraries/clBLAS/wiki/AutoGemm) - - clBLAS's Gemm implementation has been comprehensively overhauled to use AutoGemm. AutoGemm is a suite of python scripts which generate optimized kernels and kernel selection logic, for all precisions, transposes, tile sizes and so on. - - CMake is configured to use AutoGemm for clBLAS so the build and usage experience of Gemm remains unchanged (only performance and maintainability has been improved). Kernel sources are generated at build time (not runtime) and can be configured within CMake to be pre-compiled at build time. - - clBLAS users with unique Gemm requirements can customize AutoGemm to their needs (such as non-default tile sizes for very small or very skinny matrices); see [AutoGemm](http://github.com/clMathLibraries/clBLAS/wiki/AutoGemm) documentation for details.
+## clBLAS update notes 01/2017 +- v2.12 is a bugfix release as a rollup of all fixes in /develop branch + - Thanks to @pavanky, @iotamudelta, @shahsan10, @psyhtest, @haahh, @hughperkins, @tfauck + @abhiShandy, @IvanVergiliev, @zougloub, @mgates3 for contributions to clBLAS v2.12 +- Summary of fixes available to read on the releases tab ## clBLAS library user documentation @@ -202,7 +201,7 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate - Netlib CBLAS (recommended) Ubuntu: install by "apt-get install libblas-dev" Windows: download & install lapack-3.6.0 which comes with CBLAS - - or ACML on windows/linux; Accelerate on Mac OSX + - or ACML on windows/linux; Accelerate on Mac OSX ### Performance infrastructure * Python