From 419d595ad988f33a72843394d12d0557909f9ced Mon Sep 17 00:00:00 2001 From: Andy Zeng Date: Sat, 17 Sep 2016 18:03:45 -0400 Subject: [PATCH] add convnet-training code --- README.md | 43 +- convnet-training/apc.hpp | 251 + convnet-training/compile.sh | 13 + convnet-training/marvin.cu | 78 + convnet-training/marvin.hpp | 8010 +++++++++++++++++ .../models/hha-fcn/train_shelf_depth.json | 500 + .../models/hha-fcn/train_tote_depth.json | 500 + .../models/rgb-fcn/train_shelf_color.json | 506 ++ .../models/rgb-fcn/train_tote_color.json | 506 ++ .../rgb-hha-fcn/train_shelf_color_depth.json | 909 ++ .../rgb-hha-fcn/train_tote_color_depth.json | 909 ++ .../models/weights/download_weights.sh | 2 + convnet-training/util/depth_utils.h | 25 + convnet-training/util/random_utils.h | 37 + convnet-training/util/system_utils.h | 20 + .../src/marvin_convnet/src/detect.cu | 22 +- .../src/marvin_convnet/src/save_images.cpp | 9 + 17 files changed, 12325 insertions(+), 15 deletions(-) create mode 100755 convnet-training/apc.hpp create mode 100755 convnet-training/compile.sh create mode 100755 convnet-training/marvin.cu create mode 100755 convnet-training/marvin.hpp create mode 100644 convnet-training/models/hha-fcn/train_shelf_depth.json create mode 100644 convnet-training/models/hha-fcn/train_tote_depth.json create mode 100644 convnet-training/models/rgb-fcn/train_shelf_color.json create mode 100644 convnet-training/models/rgb-fcn/train_tote_color.json create mode 100644 convnet-training/models/rgb-hha-fcn/train_shelf_color_depth.json create mode 100644 convnet-training/models/rgb-hha-fcn/train_tote_color_depth.json create mode 100644 convnet-training/models/weights/download_weights.sh create mode 100755 convnet-training/util/depth_utils.h create mode 100755 convnet-training/util/random_utils.h create mode 100755 convnet-training/util/system_utils.h diff --git a/README.md b/README.md index 00e9285..a1b8c45 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ # MIT-Princeton Vision Toolbox for the APC 2016 -UNFINISHED ... PLEASE DO NOT USE + ## Documentation * [Realsense Standalone](#realsense-standalone) * [Realsense ROS Package](#realsense-ros-package) * [Deep Learning FCN ROS Package](#deep-learning-fcn-ros-package) +* [FCN Training with Marvin](#fcn-training-with-marvin) +* [Evaluation Code](#evaluation-code) ## Realsense Standalone @@ -79,7 +81,7 @@ rosrun realsense_camera capture ## Deep Learning FCN ROS Package -A C++ ROS package for deep learning based object segmentation using [FCNs (Fully Convolutional Networks)](https://arxiv.org/abs/1411.4038) with [Marvin](http://marvin.is/), a lightweight GPU-only neural network framework. This package feeds RGB-D data forward through a pre-trained ConvNet to retrieve object segmentation results. The neural networks are trained offline with Marvin. +A C++ ROS package for deep learning based object segmentation using [FCNs (Fully Convolutional Networks)](https://arxiv.org/abs/1411.4038) with [Marvin](http://marvin.is/), a lightweight GPU-only neural network framework. This package feeds RGB-D data forward through a pre-trained ConvNet to retrieve object segmentation results. The neural networks are trained offline with Marvin (see [FCN Training with Marvin](#fcn-training-with-marvin)). 
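+The trained networks can also be exercised outside of ROS with the standalone `marvin` binary built in `convnet-training` (see [FCN Training with Marvin](#fcn-training-with-marvin)). A minimal sketch, with the argument order taken from `convnet-training/marvin.cu` and a placeholder weights filename:
+
+```shell
+# Sketch only: <your_model>.marvin is a placeholder for a weights file saved during training.
+# "./marvin test <architecture.json> <weights.marvin>" runs forward passes over the test-phase
+# data defined in the .json; marvin.cu's test mode also accepts two extra string-list
+# arguments and an itersPerSave count (see convnet-training/marvin.cu for details).
+cd convnet-training
+./marvin test models/rgb-fcn/train_shelf_color.json <your_model>.marvin
+```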
See `ros-packages/marvin_convnet`

@@ -120,8 +122,45 @@ ros package to compute hha
`rosservice call /marvin_convnet ["elmers_washable_no_run_school_glue","expo_dry_erase_board_eraser"] 0 0`

+## FCN Training with Marvin
+
+Code and models for training object segmentation using [FCNs (Fully Convolutional Networks)](https://arxiv.org/abs/1411.4038) with [Marvin](http://marvin.is/), a lightweight GPU-only neural network framework. Includes network architecture .json files in `convnet-training/models` and a Marvin data layer in `convnet-training/apc.hpp` that randomly samples images (RGB and HHA) from the segmentation training dataset [here](http://www.cs.princeton.edu/~andyz/apc2016).
+
+See `convnet-training`
+
+### Dependencies
+
+1. [CUDA 7.5](https://developer.nvidia.com/cuda-downloads) and [cuDNN 5](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA. Below are some additional steps to set up cuDNN 5. **NOTE** We highly recommend that you install different versions of cuDNN to different directories (e.g., ```/usr/local/cudnn/vXX```) because different software packages may require different versions.
+
+```shell
+LIB_DIR=lib$([[ $(uname) == "Linux" ]] && echo 64)
+CUDNN_LIB_DIR=/usr/local/cudnn/v5/$LIB_DIR
+echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_LIB_DIR >> ~/.profile && source ~/.profile
+
+tar zxvf cudnn*.tgz
+sudo cp cuda/$LIB_DIR/* $CUDNN_LIB_DIR/
+sudo cp cuda/include/* /usr/local/cudnn/v5/include/
+```
+
+2. OpenCV (tested with OpenCV 2.4.11)
+ * Used for reading images
+
+### Setup Instructions
+1. Download the segmentation training dataset from [here](http://www.cs.princeton.edu/~andyz/apc2016)
+2. Specify the training dataset filepath in the APCData layer of the network architecture in `convnet-training/models/rgb-fcn/train_shelf_color.json`
+3. Navigate to `convnet-training/models/weights/` and run the bash script `./download_weights.sh` to download VGG weights pre-trained on ImageNet (see [Marvin](http://marvin.is/) for more pre-trained weights)
+4. Navigate to `convnet-training/` and run `./compile.sh` in a terminal to compile Marvin
+5. Run `./marvin train models/rgb-fcn/train_shelf_color.json models/weights/vgg16_imagenet_half.marvin` in a terminal to train a segmentation model on RGB-D data with objects in the shelf (for objects in the tote, use the network architecture `models/rgb-fcn/train_tote_color.json`)
+
+## Evaluation Code
+
+Code used to perform the experiments in the paper; it tests the full vision system on the 'Shelf & Tote' benchmark dataset.
+
+See `evaluation`
+
+### Setup Instructions
+1. Download the full 'Shelf & Tote' benchmark dataset from [here](http://www.cs.princeton.edu/~andyz/apc2016) and extract its contents to `apc-vision-toolbox/data/benchmark` (e.g. `apc-vision-toolbox/data/benchmark/office`, `apc-vision-toolbox/data/benchmark/warehouse`, etc.)
+2. In `evaluation/getError.m`, change the variable `benchmarkPath` to point to the filepath of your benchmark dataset directory
+3. We have provided our vision system's predictions in a saved Matlab .mat file `evaluation/predictions.mat`.
To compute the accuracy of these predictions against the ground truth labels of the 'Shelf & Tote' benchmark dataset, run `evaluation/getError.m` diff --git a/convnet-training/apc.hpp b/convnet-training/apc.hpp new file mode 100755 index 0000000..ce77aa6 --- /dev/null +++ b/convnet-training/apc.hpp @@ -0,0 +1,251 @@ +// --------------------------------------------------------- +// Copyright (c) 2016, Andy Zeng +// +// This file is part of the APC Vision Toolbox and is available +// under the terms of the Simplified BSD License provided in +// LICENSE. Please retain this notice and LICENSE if you use +// this file (or any portion of it) in your project. +// --------------------------------------------------------- + +#include "system_utils.h" +#include "depth_utils.h" +#include "random_utils.h" + +template +class APCDataLayer : public DataLayer { + std::future lock; + + std::vector dataFILE; + std::vector dataCPU; + std::vector dataGPU; + std::vector labelCPU; + std::vector labelGPU; + + int epoch_prefetch; +public: + std::vector file_data; + + int batch_size; + int curr_obj_idx = 0; + std::vector object_list; + int num_objects; + + int numofitems() { return 0; }; + + void init() { + epoch_prefetch = 0; + train_me = true; + std::cout << "APCDataLayer: " << std::endl; + dataCPU.resize(2); + dataGPU.resize(2); + labelCPU.resize(1); + labelGPU.resize(1); + dataFILE.resize(file_data.size()); + + // List objects found under data directory + std::cout << " Loading data from directory: " << file_data[0] << std::endl; + GetFilesInDirectory(file_data[0], object_list, ""); + num_objects = object_list.size(); + std::sort(object_list.begin(), object_list.end()); + for (int i = 0 ; i < num_objects; i++) + std::cout << " " << object_list[i] << std::endl; + + // Compute batch data sizes + std::vector image_dim; + image_dim.push_back(batch_size); image_dim.push_back(3); image_dim.push_back(480); image_dim.push_back(640); + dataCPU[0] = new StorageT[numel(image_dim)]; + dataCPU[1] = new StorageT[numel(image_dim)]; + + // Compute batch label sizes + std::vector label_dim; + label_dim.push_back(batch_size); label_dim.push_back(1); label_dim.push_back(480); label_dim.push_back(640); + labelCPU[0] = new StorageT[numel(label_dim)]; + }; + + APCDataLayer(std::string name_, Phase phase_, std::vector file_data_, int batch_size_): + DataLayer(name_), file_data(file_data_), batch_size(batch_size_) { + phase = phase_; + init(); + }; + + APCDataLayer(JSON* json) { + SetOrDie(json, name) + SetValue(json, phase, Training) + SetOrDie(json, file_data ) + SetOrDie(json, batch_size ) + init(); + }; + + ~APCDataLayer() { + if (lock.valid()) lock.wait(); + for (int i = 0; i < dataFILE.size(); ++i) + if (dataFILE[i] != NULL) fclose(dataFILE[i]); + for (int i = 0; i < dataCPU.size(); ++i) + if (dataCPU[i] != NULL) delete [] dataCPU[i]; + for (int i = 0; i < labelCPU.size(); ++i) + if (labelCPU[i] != NULL) delete [] labelCPU[i]; + for (int i = 0; i < dataGPU.size(); ++i) + if (dataGPU[i] != NULL) checkCUDA(__LINE__, cudaFree(dataGPU[i])); + for (int i = 0; i < labelGPU.size(); ++i) + if (labelGPU[i] != NULL) checkCUDA(__LINE__, cudaFree(labelGPU[i])); + }; + + void shuffle() {}; + + void prefetch() { + + checkCUDA(__LINE__, cudaSetDevice(GPU)); + + std::string data_directory = file_data[0]; + + for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { + std::string curr_obj_name = object_list[curr_obj_idx]; + std::string curr_obj_directory = data_directory + "/" + curr_obj_name; + + // Select a random sequence from object 
directory + std::vector sequence_list; + GetFilesInDirectory(curr_obj_directory, sequence_list, "seq-0"); + std::sort(sequence_list.begin(), sequence_list.end()); + int rand_sequence_idx = (int)floor(GetRandomFloat(0, (float)sequence_list.size())); + std::string curr_sequence_name = sequence_list[rand_sequence_idx]; + std::string curr_sequence_directory = curr_obj_directory + "/" + curr_sequence_name; + + // Select a random image from the sequence + std::vector image_list; + GetFilesInDirectory(curr_sequence_directory, image_list, ".color.png"); + std::sort(image_list.begin(), image_list.end()); + int rand_image_idx = (int)floor(GetRandomFloat(0, (float)image_list.size())); + std::string curr_image_name = image_list[rand_image_idx]; + + // Debug + // std::cout << curr_sequence_directory + "/" + curr_image_name << std::endl; + + // Read color RGB data (BGR, mean subtracted) + std::string curr_RGB_file = curr_sequence_directory + "/" + curr_image_name; + cv::Mat curr_RGB_image = cv::imread(curr_RGB_file.c_str(), CV_LOAD_IMAGE_COLOR); + uint8_t * curr_RGB_raw = curr_RGB_image.data; + StorageT * curr_RGB_data = new StorageT[3 * 480 * 640]; + for (int tmp_row = 0; tmp_row < 480; tmp_row++) + for (int tmp_col = 0; tmp_col < 640; tmp_col++) { + // curr_RGB_data[0 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(((float) curr_RGB_image.at(tmp_row, tmp_col)[0]) - 102.9801f); //102.9801f; // B + // curr_RGB_data[1 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(((float) curr_RGB_image.at(tmp_row, tmp_col)[1]) - 115.9465f); //115.9465f; // G + // curr_RGB_data[2 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(((float) curr_RGB_image.at(tmp_row, tmp_col)[2]) - 122.7717f); //122.7717f; // R + curr_RGB_data[0 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(ComputeT(curr_RGB_raw[0 + 3 * (tmp_col + 640 * tmp_row)]) - ComputeT(102.9801f)); // B + curr_RGB_data[1 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(ComputeT(curr_RGB_raw[1 + 3 * (tmp_col + 640 * tmp_row)]) - ComputeT(115.9465f)); // G + curr_RGB_data[2 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(ComputeT(curr_RGB_raw[2 + 3 * (tmp_col + 640 * tmp_row)]) - ComputeT(122.7717f)); // R + } + + // Read depth HHA data (BGR, mean subtracted) + std::string curr_HHA_file = curr_sequence_directory + "/HHA/" + curr_image_name.substr(0, curr_image_name.length() - 10) + ".HHA.png"; + cv::Mat curr_HHA_image = cv::imread(curr_HHA_file.c_str(), CV_LOAD_IMAGE_COLOR); + uint8_t * curr_HHA_raw = curr_HHA_image.data; + StorageT * curr_HHA_data = new StorageT[3 * 480 * 640]; + for (int tmp_row = 0; tmp_row < 480; tmp_row++) + for (int tmp_col = 0; tmp_col < 640; tmp_col++) { + // curr_HHA_data[0 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(((float) curr_HHA_image.at(tmp_row, tmp_col)[0]) - 102.9801f); //102.9801f; // B + // curr_HHA_data[1 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(((float) curr_HHA_image.at(tmp_row, tmp_col)[1]) - 115.9465f); //115.9465f; // G + // curr_HHA_data[2 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(((float) curr_HHA_image.at(tmp_row, tmp_col)[2]) - 122.7717f); //122.7717f; // R + curr_HHA_data[0 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(ComputeT(curr_HHA_raw[0 + 3 * (tmp_col + 640 * tmp_row)]) - ComputeT(102.9801f)); // B + curr_HHA_data[1 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(ComputeT(curr_HHA_raw[1 + 3 * (tmp_col + 640 * tmp_row)]) - ComputeT(115.9465f)); // G + 
curr_HHA_data[2 * 480 * 640 + tmp_row * 640 + tmp_col] = CPUCompute2StorageT(ComputeT(curr_HHA_raw[2 + 3 * (tmp_col + 640 * tmp_row)]) - ComputeT(122.7717f)); // R + } + + // Debug + // FILE * fp = fopen(("test" + std::to_string(batch_idx) + ".color.bin").c_str(), "wb"); + // fwrite(curr_RGB_data, sizeof(float), 3 * 480 * 640, fp); + // fclose(fp); + + // Read label masks + std::string curr_mask_file = curr_sequence_directory + "/masks/" + curr_image_name.substr(0, curr_image_name.length() - 10) + ".mask.png"; + cv::Mat curr_mask = cv::imread(curr_mask_file.c_str(), CV_LOAD_IMAGE_GRAYSCALE); + StorageT * curr_label_data = new StorageT[480 * 640]; + for (int tmp_row = 0; tmp_row < 480; tmp_row++) + for (int tmp_col = 0; tmp_col < 640; tmp_col++) { + if (((int) curr_mask.at(tmp_row, tmp_col)) > 0) + curr_label_data[tmp_row * 640 + tmp_col] = CPUCompute2StorageT(curr_obj_idx + 1); // Give object label + else + curr_label_data[tmp_row * 640 + tmp_col] = CPUCompute2StorageT(0); // Background + } + + // Debug + // fp = fopen(("test" + std::to_string(batch_idx) + ".mask.bin").c_str(), "wb"); + // fwrite(curr_label_data, sizeof(float), 480 * 640, fp); + // fclose(fp); + + // Copy data to GPU + checkCUDA(__LINE__, cudaMemcpy(&(dataGPU[0][batch_idx * 3 * 480 * 640]), curr_RGB_data, 3 * 480 * 640 * sizeofStorageT, cudaMemcpyHostToDevice)); + checkCUDA(__LINE__, cudaMemcpy(&(dataGPU[1][batch_idx * 3 * 480 * 640]), curr_HHA_data, 3 * 480 * 640 * sizeofStorageT, cudaMemcpyHostToDevice)); + checkCUDA(__LINE__, cudaMemcpy(&(labelGPU[0][batch_idx * 480 * 640]), curr_label_data, 480 * 640 * sizeofStorageT, cudaMemcpyHostToDevice)); + + // Clear memory + delete [] curr_RGB_data; + delete [] curr_HHA_data; + delete [] curr_label_data; + + // Iterate through object list + curr_obj_idx = curr_obj_idx + 1; + if (curr_obj_idx >= num_objects) + curr_obj_idx = 0; + } + + // Debug + // float * image_data = new float[batch_size * 3 * 480 * 640]; + // float * label_data = new float[batch_size * 1 * 480 * 640]; + // checkCUDA(__LINE__, cudaMemcpy(image_data, dataGPU[0], batch_size * 3 * 480 * 640 * sizeof(float), cudaMemcpyDeviceToHost)); + // checkCUDA(__LINE__, cudaMemcpy(label_data, labelGPU[0], batch_size * 480 * 640 * sizeof(float), cudaMemcpyDeviceToHost)); + // for (int i = 0; i < batch_size * 3 * 480 * 640; i++) + // std::cout << image_data[i] << std::endl; + // for (int i = 0; i < batch_size * 480 * 640; i++) + // std::cout << label_data[i] << std::endl; + }; + + void forward(Phase phase_) { + lock.wait(); + epoch = epoch_prefetch; + std::swap(out[0]->dataGPU, dataGPU[0]); + std::swap(out[1]->dataGPU, dataGPU[1]); + std::swap(out[2]->dataGPU, labelGPU[0]); + lock = std::async(std::launch::async, &APCDataLayer::prefetch, this); + }; + + + size_t Malloc(Phase phase_) { + if (phase == Training && phase_ == Testing) return 0; + if (out.size() != 3) { + std::cout << "APCDataLayer: incorrect # of out's" << std::endl; + FatalError(__LINE__); + } + size_t memoryBytes = 0; + std::cout << (train_me ? 
"* " : " "); + std::cout << name << std::endl; + + // CPU/GPU malloc data + std::vector image_dim; + image_dim.push_back(batch_size); image_dim.push_back(3); image_dim.push_back(480); image_dim.push_back(640); + out[0]->need_diff = false; + out[0]->receptive_field.resize(image_dim.size() - 2); fill_n(out[0]->receptive_field.begin(), image_dim.size() - 2, 1); + out[0]->receptive_gap.resize(image_dim.size() - 2); fill_n(out[0]->receptive_gap.begin(), image_dim.size() - 2, 1); + out[0]->receptive_offset.resize(image_dim.size() - 2); fill_n(out[0]->receptive_offset.begin(), image_dim.size() - 2, 0); + memoryBytes += out[0]->Malloc(image_dim); + checkCUDA(__LINE__, cudaMalloc(&dataGPU[0], numel(image_dim) * sizeofStorageT) ); + memoryBytes += numel(image_dim) * sizeofStorageT; + out[1]->need_diff = false; + out[1]->receptive_field.resize(image_dim.size() - 2); fill_n(out[1]->receptive_field.begin(), image_dim.size() - 2, 1); + out[1]->receptive_gap.resize(image_dim.size() - 2); fill_n(out[1]->receptive_gap.begin(), image_dim.size() - 2, 1); + out[1]->receptive_offset.resize(image_dim.size() - 2); fill_n(out[1]->receptive_offset.begin(), image_dim.size() - 2, 0); + memoryBytes += out[1]->Malloc(image_dim); + checkCUDA(__LINE__, cudaMalloc(&dataGPU[1], numel(image_dim) * sizeofStorageT) ); + memoryBytes += numel(image_dim) * sizeofStorageT; + + // CPU/GPU malloc labels + std::vector label_dim; + label_dim.push_back(batch_size); label_dim.push_back(1); label_dim.push_back(480); label_dim.push_back(640); + out[2]->need_diff = false; + memoryBytes += out[2]->Malloc(label_dim); + checkCUDA(__LINE__, cudaMalloc(&labelGPU[0], numel(label_dim) * sizeofStorageT) ); + memoryBytes += numel(label_dim) * sizeofStorageT; + + lock = std::async(std::launch::async, &APCDataLayer::prefetch, this); + return memoryBytes; + }; +}; diff --git a/convnet-training/compile.sh b/convnet-training/compile.sh new file mode 100755 index 0000000..c4bb2ad --- /dev/null +++ b/convnet-training/compile.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export PATH=$PATH:/usr/local/cuda/bin + +if uname | grep -q Darwin; then + CUDA_LIB_DIR=/usr/local/cuda/lib + CUDNN_LIB_DIR=/usr/local/cudnn/v5/lib +elif uname | grep -q Linux; then + CUDA_LIB_DIR=/usr/local/cuda/lib64 + CUDNN_LIB_DIR=/usr/local/cudnn/v5/lib64 +fi + +nvcc -ccbin /usr/bin/g++ -std=c++11 -O3 -o marvin marvin.cu -I./util -I/usr/local/cuda/include -I/usr/local/cudnn/v5/include -L$CUDA_LIB_DIR -L$CUDNN_LIB_DIR -lcudart -lcublas -lcudnn -lcurand -D_MWAITXINTRIN_H_INCLUDED `pkg-config --cflags opencv` `pkg-config --libs opencv` diff --git a/convnet-training/marvin.cu b/convnet-training/marvin.cu new file mode 100755 index 0000000..2cd6251 --- /dev/null +++ b/convnet-training/marvin.cu @@ -0,0 +1,78 @@ +// Please choose a data type to compile +#define DATATYPE 1 +#include "marvin.hpp" + +using namespace marvin; +using namespace std; + +int main(int argc, char **argv){ + + if (argc < 3 || argc >10){ + cout<<"Usage:"<> Hello, World! This is Marvin. I am at a rough estimate thirty billion times more intelligent than you. 
Let me give you an example."< models = getStringVector(argv[3]); + for (int m=0;m models = getStringVector(argv[3]); + for (int m=0;m=6){ + int itersPerSave = 0; + if (argc==7){ + itersPerSave = atoi(argv[6]); + } + net.test(getStringVector(argv[4]), getStringVector(argv[5]), itersPerSave); + }else if (argc==4){ + net.test(); + }else FatalError(__LINE__); + + }else if(0==strcmp(argv[1], "activate")){ + + Net net(argv[2]); + net.Malloc(Testing); + + vector models = getStringVector(argv[3]); + for (int m=0;m +#elif DATATYPE==1 + #pragma message "Compiling using StorageT=float ComputeT=float" + #define StorageT float + #define ComputeT float + #define sizeofStorageT 4 + #define sizeofComputeT 4 + #define CUDNNStorageT CUDNN_DATA_FLOAT + #define CUDNNConvComputeT CUDNN_DATA_FLOAT + #define CPUStorage2ComputeT(x) (x) + #define CPUCompute2StorageT(x) (x) + #define GPUStorage2ComputeT(x) (x) + #define GPUCompute2StorageT(x) (x) + #define GPUgemm cublasSgemm + #define GPUasum cublasSasum + #define ISNAN(x) (std::isnan(x)) + #define ComputeT_MIN FLT_MIN +#elif DATATYPE==2 + #pragma message "Compiling using StorageT=double ComputeT=double" + #define StorageT double + #define ComputeT double + #define sizeofStorageT 8 + #define sizeofComputeT 8 + #define CUDNNStorageT CUDNN_DATA_DOUBLE + #define CUDNNConvComputeT CUDNN_DATA_DOUBLE + #define CPUStorage2ComputeT(x) (x) + #define CPUCompute2StorageT(x) (x) + #define GPUStorage2ComputeT(x) (x) + #define GPUCompute2StorageT(x) (x) + #define GPUgemm cublasDgemm + #define GPUasum cublasDasum + #define ISNAN(x) (std::isnan(x)) + #define ComputeT_MIN DBL_MIN +#endif + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Includes +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace marvin { + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Type definition +////////////////////////////////////////////////////////////////////////////////////////////////// + +enum Filler { Xavier, Gaussian, Constant }; +enum Pool { Max, Average, Sum }; +enum LossObjective { MultinomialLogistic_StableSoftmax, MultinomialLogistic, SmoothL1, Contrastive, EuclideanSSE, HingeL1, HingeL2, SigmoidCrossEntropy, Infogain }; +enum Phase { Training, Testing, TrainingTesting }; +enum LRPolicy { LR_fixed, LR_step, LR_exp, LR_inv, LR_multistep, LR_poly, LR_sigmoid, LR_cyclical }; +enum SolverAlgorithm { SGD, AdaDelta, AdaGrad, Adam, NAG, RMSprop}; +enum Regularizer { L2, L1 }; +enum LRN { CrossChannel, DivisiveNormalization }; +enum ElementWiseOp { ElementWise_EQL, ElementWise_MUL, ElementWise_SUM, ElementWise_MAX }; + + +ComputeT anyval; +ComputeT oneval = 1; +ComputeT zeroval = 0; +const void* one = static_cast(&oneval); +const void* zero = static_cast(&zeroval); +const ComputeT* oneComputeT = &oneval; +const ComputeT* zeroComputeT = &zeroval; + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Debugging utility +////////////////////////////////////////////////////////////////////////////////////////////////// + +void FatalError(const int lineNumber=0) { + std::cerr << "FatalError"; + if (lineNumber!=0) std::cerr<<" at LINE "< 0x7f800000) { + 
ret.x = 0x7fffU; + return ret; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + ret.x = sign | 0x7c00U; + return ret; + } + if (u < 0x33000001) { + ret.x = (sign | 0x0000); + return ret; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + ret.x = (sign | (exponent << 10) | mantissa); + + return ret; +} + + +float cpu_half2float(half h) { + unsigned sign = ((h.x >> 15) & 1); + unsigned exponent = ((h.x >> 10) & 0x1f); + unsigned mantissa = ((h.x & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + int temp = ((sign << 31) | (exponent << 23) | mantissa); + + return *((float*)((void*)&temp)); +} + + +bool operator <(const half& x, const half& y) { + return cpu_half2float(x) < cpu_half2float(y); +} + +std::ostream& operator<< (std::ostream& stream, const half& x) { + stream << cpu_half2float(x); + return stream; +} + +////////////////////////////////////////////////////////////////////////////////////////////////// +// JSON parser +////////////////////////////////////////////////////////////////////////////////////////////////// + +enum JSONType { JSON_String, JSON_Bool, JSON_Null, JSON_Number, JSON_Object, JSON_ObjectArray}; + +// plain object +class JSON{ +public: + JSONType type; + std::vector array; + std::map member; + + ~JSON(){ + for (int i=0;i::iterator it = member.begin(); it != member.end(); it++ ){ + if (it->second != NULL) + delete it->second; + } + }; + + std::string returnString(){ + if (type!=JSON_String) FatalError(__LINE__); + return *((std::string*)(array[0])); + }; + + bool returnBool(){ + if (type!=JSON_Bool) FatalError(__LINE__); + return *((bool*)(array[0])); + }; + + ComputeT returnReal(){ + if (type!=JSON_Number) FatalError(__LINE__); + return *((ComputeT*)(array[0])); + }; + + std::vector returnIntVector(){ + if (type!=JSON_Number) FatalError(__LINE__); + std::vector v(array.size()); + for (int i=0;i returnRealVector(){ + if (type!=JSON_Number) FatalError(__LINE__); + std::vector v(array.size()); + for (int i=0;i returnStringVector(){ + if (type!=JSON_String) FatalError(__LINE__); + std::vector v(array.size()); + for (int i=0;imember.find(name) == this->member.end()){ + FatalError(__LINE__); + } + else variable = (unsigned int)this->member[name]->returnReal(); + }; + + void setOrDie(std::string name, bool &variable){ + if (this->member.find(name) == this->member.end()){ + FatalError(__LINE__); + } + else variable = this->member[name]->returnBool(); + }; + + void setOrDie(std::string name, std::vector &variable){ + if (this->member.find(name) == this->member.end()) + FatalError(__LINE__); + else variable = this->member[name]->returnRealVector(); + }; + + void set(std::string name, 
bool &variable, bool default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = this->member[name]->returnBool(); + }; + + void set(std::string name, ComputeT &variable, ComputeT default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = (ComputeT)(this->member[name]->returnReal()); + }; + + void setOrDie(std::string name, ComputeT &variable){ + if (this->member.find(name) == this->member.end()) FatalError(__LINE__); + else variable = (ComputeT)(this->member[name]->returnReal()); + }; + + void set(std::string name, int &variable, int default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = (int)(this->member[name]->returnReal()); + }; + + void set(std::string name, double &variable, double default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = (double)(this->member[name]->returnReal()); + }; + + void set(std::string name, unsigned int &variable, unsigned int default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = (unsigned int)(this->member[name]->returnReal()); + }; + + void setOrDie(std::string name, int &variable){ + if (this->member.find(name) == this->member.end()) FatalError(__LINE__); + else variable = (int)(this->member[name]->returnReal()); + }; + + void set(std::string name, std::vector &variable, std::vector default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = this->member[name]->returnIntVector(); + }; + + void set(std::string name, std::vector &variable, std::vector default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = this->member[name]->returnRealVector(); + }; + + void set(std::string name, std::vector &variable, std::vector default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = this->member[name]->returnStringVector(); + }; + + void setOrDie(std::string name, std::vector &variable){ + if (this->member.find(name) == this->member.end()) FatalError(__LINE__); + else variable = this->member[name]->returnStringVector(); + }; + + void setOrDie(std::string name, std::vector &variable){ + if (this->member.find(name) == this->member.end()) FatalError(__LINE__); + else variable = this->member[name]->returnIntVector(); + }; + + void set(std::string name, std::string &variable, std::string default_value){ + if (this->member.find(name) == this->member.end()) variable = default_value; + else variable = this->member[name]->returnString(); + }; + + void setOrDie(std::string name, std::string &variable){ + if (this->member.find(name) == this->member.end()) FatalError(__LINE__); + else variable = this->member[name]->returnString(); + }; + + void setOrDie(std::string name, ElementWiseOp &variable){ + if (this->member.find(name) == this->member.end()) FatalError(__LINE__); + else if (0 == this->member[name]->returnString().compare("ElementWise_EQL")) variable = ElementWise_EQL; + else if (0 == this->member[name]->returnString().compare("ElementWise_MUL")) variable = ElementWise_MUL; + else if (0 == this->member[name]->returnString().compare("ElementWise_SUM")) variable = ElementWise_SUM; + else if (0 == this->member[name]->returnString().compare("ElementWise_MAX")) variable = ElementWise_MAX; + else{ std::cout<<"Unsupported 
"<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("Xavier")) variable = Xavier; + else if (0 == this->member[name]->returnString().compare("Gaussian")) variable = Gaussian; + else if (0 == this->member[name]->returnString().compare("Constant")) variable = Constant; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("Max")) variable = Max; + else if (0 == this->member[name]->returnString().compare("Average")) variable = Average; + else if (0 == this->member[name]->returnString().compare("Sum")) variable = Sum; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) FatalError(__LINE__); + else if (0 == this->member[name]->returnString().compare("MultinomialLogistic_StableSoftmax")) variable = MultinomialLogistic_StableSoftmax; + else if (0 == this->member[name]->returnString().compare("MultinomialLogistic")) variable = MultinomialLogistic; + else if (0 == this->member[name]->returnString().compare("SmoothL1")) variable = SmoothL1; + else if (0 == this->member[name]->returnString().compare("Contrastive")) variable = Contrastive; + else if (0 == this->member[name]->returnString().compare("EuclideanSSE")) variable = EuclideanSSE; + else if (0 == this->member[name]->returnString().compare("HingeL1")) variable = HingeL1; + else if (0 == this->member[name]->returnString().compare("HingeL2")) variable = HingeL2; + else if (0 == this->member[name]->returnString().compare("SigmoidCrossEntropy")) variable = SigmoidCrossEntropy; + else if (0 == this->member[name]->returnString().compare("Infogain")) variable = Infogain; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("Training")) variable = Training; + else if (0 == this->member[name]->returnString().compare("Testing")) variable = Testing; + else if (0 == this->member[name]->returnString().compare("TrainingTesting")) variable = TrainingTesting; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("LR_fixed")) variable = LR_fixed; + else if (0 == this->member[name]->returnString().compare("LR_step")) variable = LR_step; + else if (0 == this->member[name]->returnString().compare("LR_exp")) variable = LR_exp; + else if (0 == this->member[name]->returnString().compare("LR_inv")) variable = LR_inv; + else if (0 == this->member[name]->returnString().compare("LR_multistep")) variable = LR_multistep; + else if (0 == this->member[name]->returnString().compare("LR_poly")) variable = LR_poly; + else if (0 == this->member[name]->returnString().compare("LR_sigmoid")) variable = LR_sigmoid; + else if (0 == this->member[name]->returnString().compare("LR_cyclical")) variable = LR_cyclical; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("SGD")) variable = SGD; + else if (0 == this->member[name]->returnString().compare("AdaDelta")) variable = AdaDelta; + else if (0 == this->member[name]->returnString().compare("AdaGrad")) variable = AdaGrad; + else if (0 == 
this->member[name]->returnString().compare("Adam")) variable = Adam; + else if (0 == this->member[name]->returnString().compare("NAG")) variable = NAG; + else if (0 == this->member[name]->returnString().compare("RMSprop")) variable = RMSprop; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("L2")) variable = L2; + else if (0 == this->member[name]->returnString().compare("L1")) variable = L1; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("CrossChannel")) variable = CrossChannel; + else if (0 == this->member[name]->returnString().compare("DivisiveNormalization")) variable = DivisiveNormalization; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("Spatial")) variable = CUDNN_BATCHNORM_SPATIAL; + else if (0 == this->member[name]->returnString().compare("PerActivation")) variable = CUDNN_BATCHNORM_PER_ACTIVATION; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("max")) variable = CUDNN_POOLING_MAX; + else if (0 == this->member[name]->returnString().compare("average_include")) variable = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + else if (0 == this->member[name]->returnString().compare("average_exclude")) variable = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("Sigmoid")) variable = CUDNN_ACTIVATION_SIGMOID; + else if (0 == this->member[name]->returnString().compare("ReLU")) variable = CUDNN_ACTIVATION_RELU; + else if (0 == this->member[name]->returnString().compare("TanH")) variable = CUDNN_ACTIVATION_TANH; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("implicit_gemm")) variable = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + else if (0 == this->member[name]->returnString().compare("implicit_precomp_gemm")) variable = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + else if (0 == this->member[name]->returnString().compare("gemm")) variable = CUDNN_CONVOLUTION_FWD_ALGO_GEMM; + else if (0 == this->member[name]->returnString().compare("direct")) variable = CUDNN_CONVOLUTION_FWD_ALGO_DIRECT; + else if (0 == this->member[name]->returnString().compare("fft")) variable = CUDNN_CONVOLUTION_FWD_ALGO_FFT; + else if (0 == this->member[name]->returnString().compare("fft_tiling")) variable = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; + else if (0 == this->member[name]->returnString().compare("winograd")) variable = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("0")) variable = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; + else if (0 == this->member[name]->returnString().compare("1")) variable = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + else if (0 == 
this->member[name]->returnString().compare("fft")) variable = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT; + else if (0 == this->member[name]->returnString().compare("fft_tiling")) variable = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING; + else if (0 == this->member[name]->returnString().compare("winograd")) variable = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD; + else{ std::cout<<"Unsupported "<member[name]->returnString()<member.find(name) == this->member.end()) variable = default_value; + else if (0 == this->member[name]->returnString().compare("0")) variable = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + else if (0 == this->member[name]->returnString().compare("1")) variable = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + else if (0 == this->member[name]->returnString().compare("fft")) variable = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT; + else if (0 == this->member[name]->returnString().compare("3")) variable = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3; + else{ std::cout<<"Unsupported "<member[name]->returnString()<1) std::cout<<"["; + for (int i=0;i0) std::cout<< ","; + std::cout << "\"" << *((std::string*)(array[i])) << "\"" ; + } + if (array.size()>1) std::cout<<"]"; + std::cout<1) std::cout<<"["; + for (int i=0;i0) std::cout<< ","; + std::cout << ((*((bool*)(array[i])))? "true": "false"); + } + if (array.size()>1) std::cout<<"]"; + std::cout<1) std::cout<<"["; + for (int i=0;i0) std::cout<< ","; + std::cout << "null"; + } + if (array.size()>1) std::cout<<"]"; + std::cout<1) std::cout<<"["; + for (int i=0;i0) std::cout<< ","; + std::cout << *((ComputeT*)(array[i])); + } + if (array.size()>1) std::cout<<"]"; + std::cout<::iterator it = member.begin(); it != member.end(); it++ ){ + std::cout << "\t" << it->first << ": "; + it->second->print(); + } + std::cout<<"}"; + break; + case JSON_ObjectArray: + std::cout<<"["<print(); + if (i0){ + int e = input.find(","); + if (e==std::string::npos){ + e = input.size(); + } + std::string first = input.substr(0,e); + if (first[0]=='\"'){ + type = JSON_String; + std::string* p = new std::string(first.substr(1,first.size()-2)); + array.push_back((void*)p); + }else if (first[0]=='t'){ + type = JSON_Bool; + bool* p = new bool(true); + array.push_back((void*)p); + }else if (first[0]=='f'){ + type = JSON_Bool; + bool* p = new bool(false); + array.push_back((void*)p); + }else if (first[0]=='n'){ + type = JSON_Null; + void* p = NULL; + array.push_back((void*)p); + }else{ + type = JSON_Number; + ComputeT* p = new ComputeT(stof(first)); + array.push_back((void*)p); + } + if(e+1parseNumberOrTextArray(input.substr(0,e+1)); + this->member[name] = p; + + if (e+2parseNumberOrTextArray(input.substr(0,e)); + this->member[name] = p; + + if (e+1parseNumberOrTextArray(input.substr(0,e)); + this->member[name] = p; + + if (e+10){ + int e = input.find("}")+1; + if (e==std::string::npos){ + e = input.size(); + } + std::string first = input.substr(0,e); + JSON* pObj = new JSON; + pObj->parseObject(first); + array.push_back((void*)pObj); + + if(e+1set(#attribute,attribute,value); +#define SetOrDie(obj,attribute) obj->setOrDie(#attribute,attribute); + + +void parseNetworkJSON(std::string filename, JSON* train_obj, JSON* test_obj, JSON* architecture_obj){ + std::ifstream t(filename); + std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); + str.erase(remove_if(str.begin(), str.end(), (int(*)(int))isspace), str.end()); + std::string input = str; + int b,e; + + b = input.find("\"train\""); + std::string train_str = input.substr(b+7); + b = train_str.find("{"); + e = train_str.find("}"); 
+ train_str=train_str.substr(b,e-b+1); + if (train_obj!=NULL) train_obj->parseObject(train_str); + + b = input.find("\"test\""); + std::string test_str = input.substr(b+6); + b = test_str.find("{"); + e = test_str.find("}"); + test_str=test_str.substr(b,e-b+1); + if (test_obj!=NULL) test_obj->parseObject(test_str); + + b=input.find("\"layers\""); + input = input.substr(b+9); + e=input.find("}]"); + if (architecture_obj!=NULL) architecture_obj->parseObjectArray(input); + +} + + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Utility +////////////////////////////////////////////////////////////////////////////////////////////////// + +bool is_file_exist(const std::string& fileName){ + std::ifstream infile(fileName); + return infile.good(); +} + +void memorySizePrint(size_t bytes){ + if (bytes<512){ + std::cout<& v){ + std::cout<<"["<0) std::cout<1){ + for (int i=1;i& v){ + std::cout<<"["; + if (v.size()>0) std::cout<1){ + for (int i=1;i veci(int n, ...){ + std::vector v; + if (n==0) return v; + va_list ap; + va_start(ap, n); + for(int i = 0; i < n; i++) { + v.push_back(va_arg(ap, int)); + } + va_end(ap); + return v; +} + +std::vector vecs(int n, ...){ + std::vector v; + if (n==0) return v; + va_list ap; + va_start(ap, n); + for(int i = 0; i < n; i++) { + v.push_back(std::string(va_arg(ap, char*))); + } + va_end(ap); + return v; +} + +std::vector getStringVector(std::string input){ + std::vector ret; + while (input.size()>0){ + int e = input.find(","); + if (e==std::string::npos){ + e = input.size(); + } + std::string first = input.substr(0,e); + ret.push_back(first); + if(e+1 > getIntVectorVector(std::string input){ + //remove all space + input.erase(remove_if(input.begin(), input.end(), (int(*)(int))isspace), input.end()); + + std::vector > ret; + while (input.size()>0){ + int e; + if (input[0]=='['){ + ret.resize(ret.size()+1); + e=0; + }else if (input[0]==','){ + e=0; + }else if (input[0]==']'){ + e=0; + }else{ + e = input.find(","); + if (e==std::string::npos){ + e = input.size(); + } + int f = input.find("]"); + if (f==std::string::npos){ + f = input.size(); + } + e = min(e,f); + std::string first = input.substr(0,e); + ret[ret.size()-1].push_back(stoi(first)); + } + if(e+1& dim){ + size_t res = 1; + for (int i=0;i& dim){ + size_t res = 1; + for (int i=1;i& dim){ + size_t res = 1; + for (int i=2;i& dimA, const std::vector& dimB){ + if (dimA.size()!=dimB.size()) return false; + for (int i=0;i& dimA, const std::vector& dimB){ + if (dimA.size()!=dimB.size()) return false; + if (dimA[0]!=dimB[0]) return false; + for (int i=2;i0){ + std::cout<<" checkNaN result: "< randperm(size_t n, std::mt19937& rng){ + std::vector v(n); + for (size_t i=0;i +std::vector sort_indexes(const std::vector &v) { + // initialize original index locations + std::vector idx(v.size()); + for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; + // sort indexes based on comparing values in v + std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) {return v[i1] < v[i2];}); + return idx; +} + +std::string int_to_str(const int i) { + std::ostringstream s; + s << i; + return s.str(); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// +// CUDA kernels +////////////////////////////////////////////////////////////////////////////////////////////////// + + +#define CUDA_NUM_THREADS 512 + +#define MAX_NUM_BLOCKS 2880 + +inline int CUDA_GET_BLOCKS(const size_t N) { + return min(MAX_NUM_BLOCKS, int((N + 
size_t(CUDA_NUM_THREADS) - 1) / CUDA_NUM_THREADS)); +} + +inline size_t CUDA_GET_LOOPS(const size_t N) { + size_t total_threads = CUDA_GET_BLOCKS(N)*CUDA_NUM_THREADS; + return (N + total_threads -1)/ total_threads; +} + +__global__ void Accuracy_MultinomialLogistic( + size_t CUDA_NUM_LOOPS, size_t N, int C, int M, size_t wN, + const StorageT *pred, const StorageT *label, const StorageT *weight, + const StorageT *weightTensor, StorageT *loss) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + int l = int(GPUStorage2ComputeT(label[idx])); + int baseID = (idx / M) * C * M + idx % M; + int elementID = baseID + l * M; + ComputeT prob = GPUStorage2ComputeT(pred[elementID]); + loss[idx] = GPUCompute2StorageT(1); + for (int d = 0; d < C; ++d) { + if (GPUStorage2ComputeT(pred[baseID + d * M]) > prob) { + loss[idx] = GPUCompute2StorageT(0); + } + } + if (weight != NULL) { + loss[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(loss[idx]) * + GPUStorage2ComputeT(weight[l])); + } + if (weightTensor != NULL) { + loss[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(loss[idx]) * + GPUStorage2ComputeT( + weightTensor[idx % wN])); + } + } +} + +__global__ void Loss_MultinomialLogistic( + size_t CUDA_NUM_LOOPS, size_t N, int C, int M, size_t wN, + const StorageT* pred, const StorageT* label, const StorageT* weight, + const StorageT *weightTensor, StorageT *loss) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + int l = int(GPUStorage2ComputeT(label[idx])); + int offset = l * M + (idx % M); + int elementID = (idx / M) * C * M + offset; + ComputeT prob = max(GPUStorage2ComputeT(pred[elementID]), ComputeT_MIN); + ComputeT res = log(prob); + if (weight != NULL) res *= GPUStorage2ComputeT(weight[l]); + if (weightTensor != NULL) + res *= GPUStorage2ComputeT(weightTensor[elementID % wN]); + loss[idx] = GPUCompute2StorageT(res); + } +} + +__global__ void LossGrad_MultinomialLogistic( + size_t CUDA_NUM_LOOPS, size_t N, int C, int M, size_t wN, ComputeT scale, + const StorageT *pred, const StorageT *label, const StorageT *weight, + const StorageT *weightTensor, StorageT *diff) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + int l = int(GPUStorage2ComputeT(label[idx])); + int offset = l * M + (idx % M); + int elementID = (idx / M) * C * M + offset; + ComputeT prob = max(GPUStorage2ComputeT(pred[elementID]), ComputeT_MIN); + if (weight != NULL) scale *= GPUStorage2ComputeT(weight[l]); + if (weightTensor != NULL) + scale *= GPUStorage2ComputeT(weightTensor[elementID % wN]); + diff[elementID] = GPUCompute2StorageT( + GPUStorage2ComputeT(diff[elementID]) + scale / prob); + } +} + +// for numerical stability: http://freemind.pluskid.org/machine-learning/softmax-vs-softmax-loss-numerical-stability/ +__global__ void LossGrad_MultinomialLogistic_StableSoftmax( + size_t CUDA_NUM_LOOPS, size_t N, int C, int M, size_t wN, ComputeT scale, + const StorageT *pred, const StorageT *label, const StorageT *weight, + const StorageT *weightTensor, StorageT *diff) { + const size_t idxBase = 
size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + int l = int(GPUStorage2ComputeT(label[idx])); + int modM = idx % M; + int baseID = (idx / M) * C * M + modM; + int elementID = baseID + l * M; + + if (weight != NULL) { + scale *= GPUStorage2ComputeT(weight[l]); + } + + if (weightTensor == NULL) { + for (int d = 0; d < C; ++d) { + int k = baseID + d * M; + diff[k] = GPUCompute2StorageT(GPUStorage2ComputeT(diff[k]) + + scale * + GPUStorage2ComputeT(pred[k])); + } + diff[elementID] = GPUCompute2StorageT( + GPUStorage2ComputeT(diff[elementID]) - scale); + } else { + for (int d = 0; d < C; ++d) { + int k = baseID + d * M; + diff[k] = GPUCompute2StorageT(GPUStorage2ComputeT(diff[k]) + + scale * + GPUStorage2ComputeT(pred[k]) * + GPUStorage2ComputeT( + weightTensor[k % wN])); + } + diff[elementID] = GPUCompute2StorageT( + GPUStorage2ComputeT(diff[elementID]) - + scale * GPUStorage2ComputeT(weightTensor[elementID % wN])); + } + } +} + +__global__ void Loss_SmoothL1(size_t CUDA_NUM_LOOPS, size_t N, + const StorageT *pred, const StorageT *target, + const StorageT *weight, StorageT *loss) { + // diff = f( weight * (pred - target) ) + // f(x) = 0.5 * x^2 if |x| < 1 + // |x| - 0.5 otherwise + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + + ComputeT val = + GPUStorage2ComputeT(pred[idx]) - GPUStorage2ComputeT(target[idx]); + if (weight != NULL) val *= GPUStorage2ComputeT(weight[idx]); + + ComputeT abs_val = abs(val); + if (abs_val < 1) { + loss[idx] = GPUCompute2StorageT(0.5 * val * val); + } else { + loss[idx] = GPUCompute2StorageT(abs_val - 0.5); + } + } +} + +__global__ void Loss_EuclideanSSE(size_t CUDA_NUM_LOOPS, size_t N, + const StorageT *pred, const StorageT *target, + const StorageT *weight, StorageT *loss) { + // diff = f( weight * (pred - target) ) + // f(x) = 0.5 * x^2 + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + + ComputeT val = + GPUStorage2ComputeT(pred[idx]) - GPUStorage2ComputeT(target[idx]); + if (weight != NULL) val *= GPUStorage2ComputeT(weight[idx]); + + loss[idx] = GPUCompute2StorageT(0.5 * val * val); + } +} + +__global__ void LossGrad_SmoothL1( + size_t CUDA_NUM_LOOPS, size_t N, ComputeT scale, const StorageT *pred, + const StorageT *target, const StorageT *weight, StorageT *diff) { + // diff = scale * f'( weight * (pred - target) ) + // f'(x) = x if |x| < 1 + // = sign(x) otherwise + + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + + ComputeT val = + GPUStorage2ComputeT(pred[idx]) - GPUStorage2ComputeT(target[idx]); + if (weight != NULL) val *= GPUStorage2ComputeT(weight[idx]); + + ComputeT abs_val = abs(val); + if (abs_val < 1) { + diff[idx] = GPUCompute2StorageT( + GPUStorage2ComputeT(diff[idx]) + scale * val); + } else { + diff[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(diff[idx]) + + scale * ((ComputeT(0) < val) - + (val < ComputeT(0)))); + } + } +} + +__global__ void 
LossGrad_EuclideanSSE( + size_t CUDA_NUM_LOOPS, size_t N, ComputeT scale, const StorageT *pred, + const StorageT *target, const StorageT *weight, StorageT *diff) { + // diff = scale * f'( weight * (pred - target) ) + // f'(x) = x + + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + + ComputeT val = + GPUStorage2ComputeT(pred[idx]) - GPUStorage2ComputeT(target[idx]); + if (weight != NULL) val *= GPUStorage2ComputeT(weight[idx]); + + diff[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(diff[idx]) + scale * val); + } +} + +__global__ void Loss_Contrastive( + size_t CUDA_NUM_LOOPS, size_t N, int C, ComputeT margin, const StorageT *a, + const StorageT *b, const StorageT *y, StorageT *loss) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + ComputeT d = 0.0; + for (int c = 0; c < C; ++c) { + int i = idx * C + c; + ComputeT d_i = + GPUStorage2ComputeT(a[i]) - GPUStorage2ComputeT(b[i]); + d += d_i * d_i; + } + ComputeT y_n = GPUStorage2ComputeT(y[idx]); + ComputeT p = max(margin - sqrt(d), ComputeT(0)); + loss[idx] = GPUCompute2StorageT( + ComputeT(0.5) * (y_n * d + (ComputeT(1) - y_n) * p * p)); + } +} + +__global__ void LossGrad_Contrastive( + size_t CUDA_NUM_LOOPS, size_t N, int C, ComputeT margin, ComputeT scale, + const StorageT *a, const StorageT *b, const StorageT *y, StorageT *a_diff, + StorageT *b_diff) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * + (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N, idxBase + CUDA_NUM_LOOPS); ++idx) { + if ((int) (GPUStorage2ComputeT(y[idx]))) { + for (int c = 0; c < C; ++c) { + int i = idx * C + c; + ComputeT diff_i = + GPUStorage2ComputeT(a[i]) - GPUStorage2ComputeT(b[i]); + + ComputeT beta = scale * diff_i; + a_diff[i] = GPUCompute2StorageT( + GPUStorage2ComputeT(a_diff[i]) + beta); + b_diff[i] = GPUCompute2StorageT( + GPUStorage2ComputeT(b_diff[i]) - beta); + } + } else { + ComputeT dist_sq = 0.0; + for (int c = 0; c < C; ++c) { + int i = idx * C + c; + ComputeT diff_i = + GPUStorage2ComputeT(a[i]) - GPUStorage2ComputeT(b[i]); + dist_sq += diff_i * diff_i; + } + ComputeT dist = sqrt(dist_sq); + ComputeT mdist = margin - dist; + + if (mdist > 0.0) { + for (int c = 0; c < C; ++c) { + int i = idx * C + c; + ComputeT diff_i = + GPUStorage2ComputeT(a[i]) - GPUStorage2ComputeT(b[i]); + ComputeT beta = + -scale * mdist / (dist + ComputeT(1e-4)) * diff_i; + a_diff[i] = GPUCompute2StorageT( + GPUStorage2ComputeT(a_diff[i]) + beta); + b_diff[i] = GPUCompute2StorageT( + GPUStorage2ComputeT(b_diff[i]) - beta); + } + } + } + } +} + + +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const half* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(__half2float(pIn[idx])) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( 
ComputeT(__half2float(pIn[idx])) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const float* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const double* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint8_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint16_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint32_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const uint64_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * 
size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int8_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int16_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int32_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const int64_t* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const char* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( 
ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} +__global__ void Kernel_convert_to_StorageT_subtract(size_t CUDA_NUM_LOOPS, size_t N, size_t sizeofitem, const bool* pIn, const StorageT* pMean, StorageT* pOut) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); if (idxBase >= N) return; + if (pMean==NULL) for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) ); + else for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ) pOut[idx] = GPUCompute2StorageT( ComputeT(pIn[idx]) - GPUStorage2ComputeT(pMean[idx % sizeofitem]) ); +} + +__global__ void Kernel_set_one_hot(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, size_t idx2hot){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + GPUdst[idx] = GPUCompute2StorageT( ComputeT(idx == idx2hot) ); + } +} + +void GPU_set_one_hot(size_t N, StorageT* GPUdst, size_t idx2hot){ + Kernel_set_one_hot<<>>(CUDA_GET_LOOPS(N),N,GPUdst,idx2hot); + checkCUDA(__LINE__,cudaGetLastError()); +} + +__global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, StorageT value){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + GPUdst[idx] = value; + } +} + +void GPU_set_value(size_t N, StorageT* GPUdst, StorageT value){ + Kernel_set_value<<>>(CUDA_GET_LOOPS(N),N,GPUdst,value); + checkCUDA(__LINE__,cudaGetLastError()); +} + +void GPU_set_ones(size_t N, StorageT* GPUdst){ + GPU_set_value(N, GPUdst, CPUCompute2StorageT(1)); +} + +void GPU_set_zeros(size_t N, StorageT* GPUdst){ + GPU_set_value(N, GPUdst, CPUCompute2StorageT(0)); +} + +__global__ void Kernel_elementwise_multiplication(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + GPUdst[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(GPUsrcA[idx]) * GPUStorage2ComputeT(GPUsrcB[idx])); + } +} + +void GPU_elementwise_multiplication(size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ + Kernel_elementwise_multiplication<<>>(CUDA_GET_LOOPS(N),N,GPUdst,GPUsrcA,GPUsrcB); + checkCUDA(__LINE__,cudaGetLastError()); +} + +__global__ void Kernel_elementwise_comparison(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + GPUdst[idx] = GPUCompute2StorageT(ComputeT(bool(GPUStorage2ComputeT(GPUdst[idx])) && (GPUStorage2ComputeT(GPUsrcA[idx]) == GPUStorage2ComputeT(GPUsrcB[idx])))); + } +} + +void GPU_elementwise_comparison(size_t N, StorageT* GPUdst, const StorageT* GPUsrcA, const StorageT* GPUsrcB){ + Kernel_elementwise_comparison<<>>(CUDA_GET_LOOPS(N),N,GPUdst,GPUsrcA,GPUsrcB); + 
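+// Illustrative sketch (not from the original marvin source): every kernel in
+// this file uses the same indexing scheme -- each thread walks a contiguous
+// run of CUDA_NUM_LOOPS elements starting at idxBase, and the host wrapper
+// launches enough blocks of CUDA_NUM_THREADS threads to cover all N elements.
+// The <<< >>> launch arguments were lost from this hunk; presumably they are
+// a block count derived from N (a CUDA_GET_BLOCKS(N)-style helper) with
+// CUDA_NUM_THREADS threads per block, paired with CUDA_GET_LOOPS(N) as the
+// first kernel argument -- an assumption based on the surviving
+// CUDA_GET_LOOPS(N) calls.  Minimal self-contained version of the pattern:
+#if 0  // illustration only
+__global__ void Kernel_scale(size_t loops, size_t N, float alpha, float* x) {
+    const size_t idxBase = loops *
+        (size_t(blockDim.x) * size_t(blockIdx.x) + size_t(threadIdx.x));
+    if (idxBase >= N) return;
+    for (size_t idx = idxBase; idx < min(N, idxBase + loops); ++idx)
+        x[idx] *= alpha;                      // per-element work goes here
+}
+void GPU_scale(size_t N, float alpha, float* x) {
+    const size_t threads = 512, loops = 4;
+    const size_t blocks  = (N + threads * loops - 1) / (threads * loops);
+    Kernel_scale<<<blocks, threads>>>(loops, N, alpha, x);
+}
+#endif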
//checkCUDA(__LINE__,cudaGetLastError()); +} + +__global__ void Kernel_copyGPUforward(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* in, StorageT* out, int sizeofitem_in, int sizeofitem_out, int offset){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + int out_base = idx*sizeofitem_out+offset; + int in_base = idx*sizeofitem_in; + for(int i=0;i>>(CUDA_GET_LOOPS(N),N,in,out,sizeofitem_in,sizeofitem_out,offset); +} + + +__global__ void Kernel_copyGPUbackward(size_t CUDA_NUM_LOOPS, size_t N, StorageT* in, const StorageT* out, int sizeofitem_in, int sizeofitem_out, int offset){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + int in_base = idx*sizeofitem_in; + int out_base = idx*sizeofitem_out+offset; + for(int i=0;i>>(CUDA_GET_LOOPS(N),N,in,out,sizeofitem_in,sizeofitem_out,offset); +} + +__global__ void Kernel_elementwise_acc(size_t CUDA_NUM_LOOPS, size_t N, StorageT* GPUdst, const StorageT* GPUsrc){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + GPUdst[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(GPUdst[idx]) + GPUStorage2ComputeT(GPUsrc[idx]) ); + } +} + +__global__ void Kernel_ROIforward_2D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* out, const StorageT* in, const StorageT* start, int od1, int od2, int od3, int id1, int id2, int id3){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ + int n = (o / (od1*od2*od3)); + int o1 = (o / ( od2*od3)) % od1; + int o2 = (o / od3 ) % od2; + int o3 = (o ) % od3; + int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*3+0]))); + int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*3+1]))); + int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*3+2]))); + int i = i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3; + out[o] = in[i]; + } +} + +__global__ void Kernel_ROIforward_3D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* out, const StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int id1, int id2, int id3, int id4){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ + int n = (o / (od1*od2*od3*od4)); + int o1 = (o / ( od2*od3*od4)) % od1; + int o2 = (o / ( od3*od4)) % od2; + int o3 = (o / ( od4)) % od3; + int o4 = (o ) % od4; + int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*4+0]))); + int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*4+1]))); + int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*4+2]))); + int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*4+3]))); + int i = i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4; + out[o] = in[i]; + } +} + +__global__ void Kernel_ROIforward_4D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* out, const StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int od5, int id1, int id2, int id3, int id4, int id5){ + const size_t idxBase = 
size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ + int n = (o / (od1*od2*od3*od4*od5)); + int o1 = (o / ( od2*od3*od4*od5)) % od1; + int o2 = (o / ( od3*od4*od5)) % od2; + int o3 = (o / ( od4*od5)) % od3; + int o4 = (o / ( od5)) % od4; + int o5 = (o ) % od5; + int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*5+0]))); + int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*5+1]))); + int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*5+2]))); + int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*5+3]))); + int i5 = o5 + ((int)(GPUStorage2ComputeT(start[n*5+4]))); + int i = i5 + (i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4) * id5; + out[o] = in[i]; + } +} + +__global__ void Kernel_ROIbackward_2D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* out, StorageT* in, const StorageT* start, int od1, int od2, int od3, int id1, int id2, int id3){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ + int n = (o / (od1*od2*od3)); + int o1 = (o / ( od2*od3)) % od1; + int o2 = (o / od3 ) % od2; + int o3 = (o ) % od3; + int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*3+0]))); + int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*3+1]))); + int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*3+2]))); + int i = i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3; + in[i] = GPUCompute2StorageT( GPUStorage2ComputeT(in[i]) + GPUStorage2ComputeT(out[o]) ); + } +} + +__global__ void Kernel_ROIbackward_3D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* out, StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int id1, int id2, int id3, int id4){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ + int n = (o / (od1*od2*od3*od4)); + int o1 = (o / ( od2*od3*od4)) % od1; + int o2 = (o / ( od3*od4)) % od2; + int o3 = (o / ( od4)) % od3; + int o4 = (o ) % od4; + int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*4+0]))); + int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*4+1]))); + int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*4+2]))); + int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*4+3]))); + int i = i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4; + in[i] = GPUCompute2StorageT( GPUStorage2ComputeT(in[i]) + GPUStorage2ComputeT(out[o]) ); + } +} + +__global__ void Kernel_ROIbackward_4D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* out, StorageT* in, const StorageT* start, int od1, int od2, int od3, int od4, int od5, int id1, int id2, int id3, int id4, int id5){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t o = idxBase; o < min(N,idxBase+CUDA_NUM_LOOPS); ++o ){ + int n = (o / (od1*od2*od3*od4*od5)); + int o1 = (o / ( od2*od3*od4*od5)) % od1; + int o2 = (o / ( od3*od4*od5)) % od2; + int o3 = (o / ( od4*od5)) % od3; + int o4 = (o / ( od5)) % od4; + int o5 = (o ) % od5; + int i1 = o1 + ((int)(GPUStorage2ComputeT(start[n*5+0]))); + int i2 = o2 + ((int)(GPUStorage2ComputeT(start[n*5+1]))); + int i3 = o3 + ((int)(GPUStorage2ComputeT(start[n*5+2]))); + int i4 = o4 + ((int)(GPUStorage2ComputeT(start[n*5+3]))); + int 
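+// Illustrative note (not from the original marvin source): Kernel_ROIforward_*
+// and Kernel_ROIbackward_* crop (or scatter gradients back into) a per-sample
+// window whose integer start offsets are read from `start`; the mapping is
+// plain row-major index arithmetic.  Worked example for the 2D case with input
+// item dims (id1,id2,id3) = (3,8,8), output dims (od1,od2,od3) = (3,4,4) and
+// start = (0,2,3): output element (n=0, o1=1, o2=2, o3=1) maps to input
+// coordinates (i1,i2,i3) = (1,4,4), i.e. i = 4 + (4 + (1 + 0*3)*8)*8 = 100.
+#if 0  // illustration only
+// Hypothetical helper mirroring the index math used by the 2D ROI kernels.
+inline int roi_input_index(int n, int o1, int o2, int o3,
+                           const int start[3], int id1, int id2, int id3) {
+    int i1 = o1 + start[0], i2 = o2 + start[1], i3 = o3 + start[2];
+    return i3 + (i2 + (i1 + n * id1) * id2) * id3;
+}
+#endif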
i5 = o5 + ((int)(GPUStorage2ComputeT(start[n*5+4]))); + int i = i5 + (i4 + (i3 + ( i2 + ( i1 + n * id1 ) * id2 ) * id3 ) * id4) * id5; + in[i] = GPUCompute2StorageT( GPUStorage2ComputeT(in[i]) + GPUStorage2ComputeT(out[o]) ); + } +} + +__global__ void CoeffElementWiseSumReplace(size_t CUDA_NUM_LOOPS, size_t N, const ComputeT coeff, const StorageT* coeff_data, const size_t num_offset, const size_t dim, const StorageT* in, StorageT* out) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + const ComputeT final_coeff = coeff_data ? ( GPUStorage2ComputeT(coeff_data[num_offset + idx / dim]) * coeff) : coeff; + out[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(in[idx]) * final_coeff ); + } +} + +__global__ void CoeffElementWiseSumAccumulate(size_t CUDA_NUM_LOOPS, size_t N, const ComputeT coeff, const StorageT* coeff_data, const size_t num_offset, const size_t dim, const StorageT* in, StorageT* out) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + const ComputeT final_coeff = coeff_data ? ( GPUStorage2ComputeT(coeff_data[num_offset + idx / dim]) * coeff) : coeff; + out[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(out[idx]) + GPUStorage2ComputeT(in[idx]) * final_coeff ); + } +} + + +/* ---------------------------------------------------------------------------- + * The following four functions are inspired by Ross Girshick's Fast-RCNN code, + * which is copyrighted by Microsoft under an MIT License. + * + * Project page: https://github.com/rbgirshick/fast-rcnn + * License page: https://github.com/rbgirshick/fast-rcnn/blob/master/LICENSE + * ---------------------------------------------------------------------------- + */ +__global__ void Kernel_ROIPoolForward_2D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* in_data, const StorageT* in_rois, StorageT* out_data, size_t* argmax_data, const ComputeT spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + // (n, c, ph, pw) is an element in the pooled output + int pw = (index) % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = (index / pooled_width / pooled_height / channels); + + int roi_5n = n*5; + int roi_batch_ind = GPUStorage2ComputeT(in_rois[roi_5n+0]); + int roi_start_h = ::round(GPUStorage2ComputeT(in_rois[roi_5n+1]) * spatial_scale); + int roi_end_h = ::round(GPUStorage2ComputeT(in_rois[roi_5n+2]) * spatial_scale); + int roi_start_w = ::round(GPUStorage2ComputeT(in_rois[roi_5n+3]) * spatial_scale); + int roi_end_w = ::round(GPUStorage2ComputeT(in_rois[roi_5n+4]) * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int hstart = 
static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + ComputeT maxval = is_empty ? 0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + size_t maxidx = SIZE_MAX; + + size_t in_offset = (roi_batch_ind * channels + c) * height * width; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + size_t in_index = in_offset + h * width + w; + ComputeT v = GPUStorage2ComputeT(in_data[in_index]); + if (v > maxval) { + maxval = v; + maxidx = in_index; + } + } + } + out_data[index] = GPUCompute2StorageT(maxval); + if (argmax_data!=NULL) argmax_data[index] = maxidx; + + } +} + +__global__ void Kernel_ROIPoolForward_3D(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* in_data, const StorageT* in_rois, StorageT* out_data, size_t* argmax_data, const ComputeT spatial_scale, const int channels, const int depth, const int height, const int width, const int pooled_depth, const int pooled_height, const int pooled_width){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + + // (n, c, pd, ph, pw) is an element in the pooled output + int pw = (index) % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int pd = (index / pooled_width / pooled_height) % pooled_depth; + int c = (index / pooled_width / pooled_height / pooled_depth ) % channels; + int n = (index / pooled_width / pooled_height / pooled_depth / channels); + + int roi_7n = n * 7; + int roi_batch_ind = GPUStorage2ComputeT(in_rois[roi_7n+0]); + int roi_start_d = ::round(GPUStorage2ComputeT(in_rois[roi_7n+1]) * spatial_scale); + int roi_end_d = ::round(GPUStorage2ComputeT(in_rois[roi_7n+2]) * spatial_scale); + int roi_start_h = ::round(GPUStorage2ComputeT(in_rois[roi_7n+3]) * spatial_scale); + int roi_end_h = ::round(GPUStorage2ComputeT(in_rois[roi_7n+4]) * spatial_scale); + int roi_start_w = ::round(GPUStorage2ComputeT(in_rois[roi_7n+5]) * spatial_scale); + int roi_end_w = ::round(GPUStorage2ComputeT(in_rois[roi_7n+6]) * spatial_scale); + + + // Force malformed ROIs to be 1x1 + int roi_depth = max(roi_end_d - roi_start_d + 1, 1); + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + + ComputeT bin_size_d = static_cast(roi_depth) / static_cast(pooled_depth); + ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + + int dstart = static_cast(floor(static_cast(pd) * bin_size_d)); + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int dend = static_cast(ceil(static_cast(pd + 1) * bin_size_d)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); 
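+// Illustrative note (not from the original marvin source): in these ROI
+// pooling kernels each pooled cell (ph, pw) covers an adaptively sized bin of
+// the ROI, and the maximum over that bin is written out together with its
+// flat input index (argmax_data) so the backward pass can route the gradient
+// to exactly one input element.  The static_cast<> type arguments were lost
+// from this hunk; presumably they are static_cast<int> on the outside and
+// static_cast<ComputeT> on the inside, as in the Fast-RCNN code this is based
+// on.  Worked example: roi_height = 7 pooled to pooled_height = 3 gives
+// bin_size_h = 7/3 ~= 2.33, so bin ph = 1 spans hstart = floor(1 * 2.33) = 2
+// up to hend = ceil(2 * 2.33) = 5, i.e. ROI rows 2..4.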
+ + // Add roi offsets and clip to input boundaries + + dstart = min(max(dstart + roi_start_d, 0), depth); + dend = min(max(dend + roi_start_d, 0), depth); + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (dend <= dstart) || (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + ComputeT maxval = is_empty ? 0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + size_t maxidx = SIZE_MAX; + size_t in_offset = (roi_batch_ind * channels + c) * depth * height * width; + + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + size_t in_index = in_offset + d * height * width + h * width + w; + ComputeT v = GPUStorage2ComputeT(in_data[in_index]); + if (v > maxval) { + maxval = v; + maxidx = in_index; + } + } + } + } + out_data[index] = GPUCompute2StorageT(maxval); + if (argmax_data!=NULL) argmax_data[index] = maxidx; + } +} + +__global__ void Kernel_ROIPoolBackward_2D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* in_diff, const StorageT* in_rois, const StorageT* out_diff, const size_t* argmax_data, const ComputeT spatial_scale, const int num_rois, const int channels, const int height, const int width, const int pooled_height, const int pooled_width) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + + // (n, c, h, w) coords in in data + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + ComputeT gradient = GPUStorage2ComputeT(in_diff[index]); + // Accumulate gradient over all ROIs that pooled this element + for (int roi_n = 0; roi_n < num_rois; ++roi_n) { + int roi_5n = roi_n*5; + int roi_batch_ind = (int)(GPUStorage2ComputeT(in_rois[roi_5n+0])); + // Skip if ROI's batch index doesn't match n + if (n != roi_batch_ind) { + continue; + } + + int roi_start_h = ::round(GPUStorage2ComputeT(in_rois[roi_5n+1]) * spatial_scale); + int roi_end_h = ::round(GPUStorage2ComputeT(in_rois[roi_5n+2]) * spatial_scale); + int roi_start_w = ::round(GPUStorage2ComputeT(in_rois[roi_5n+3]) * spatial_scale); + int roi_end_w = ::round(GPUStorage2ComputeT(in_rois[roi_5n+4]) * spatial_scale); + + // Skip if ROI doesn't include (h, w) + const bool in_roi = (w >= roi_start_w && w <= roi_end_w && h >= roi_start_h && h <= roi_end_h); + if (!in_roi) { + continue; + } + + size_t offset = (roi_n * channels + c) * pooled_height * pooled_width; + + // Compute feasible set of pooled units that could have pooled + // this in unit + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + + ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); + int phend = ceil(static_cast(h - roi_start_h + 1) / bin_size_h); + int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); + int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); + + phstart = min(max(phstart, 0), pooled_height); + phend = 
min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + size_t out_index = ph * pooled_width + pw; + if (argmax_data[offset + out_index] == (h * width + w)) { + gradient += GPUStorage2ComputeT(out_diff[offset + out_index]); + } + } + } + } + in_diff[index] = GPUCompute2StorageT(gradient); + } +} + +__global__ void Kernel_ROIPoolBackward_3D(size_t CUDA_NUM_LOOPS, size_t N, StorageT* in_diff, const StorageT* in_rois, const StorageT* out_diff, const size_t* argmax_data, const ComputeT spatial_scale, const int num_rois, const int channels, const int depth, const int height, const int width, const int pooled_depth, const int pooled_height, const int pooled_width) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + + // (n, c, h, w) coords in in data + int w = index % width; + int h = (index / width) % height; + int d = (index / width / height) % depth; + int c = (index / width / height / depth) % channels; + int n = index / width / height / depth / channels; + + ComputeT gradient = GPUStorage2ComputeT(in_diff[index]); + // Accumulate gradient over all ROIs that pooled this element + for (int roi_n = 0; roi_n < num_rois; ++roi_n) { + int roi_7n = roi_n*7; + int roi_batch_ind = (int)(GPUStorage2ComputeT(in_rois[roi_7n+0])); + // Skip if ROI's batch index doesn't match n + if (n != roi_batch_ind) { + continue; + } + + int roi_start_d = ::round(GPUStorage2ComputeT(in_rois[roi_7n+1]) * spatial_scale); + int roi_end_d = ::round(GPUStorage2ComputeT(in_rois[roi_7n+2]) * spatial_scale); + int roi_start_h = ::round(GPUStorage2ComputeT(in_rois[roi_7n+3]) * spatial_scale); + int roi_end_h = ::round(GPUStorage2ComputeT(in_rois[roi_7n+4]) * spatial_scale); + int roi_start_w = ::round(GPUStorage2ComputeT(in_rois[roi_7n+5]) * spatial_scale); + int roi_end_w = ::round(GPUStorage2ComputeT(in_rois[roi_7n+6]) * spatial_scale); + + // Skip if ROI doesn't include (h, w) + const bool in_roi = (w >= roi_start_w && w <= roi_end_w && h >= roi_start_h && h <= roi_end_h && d >= roi_start_d && d <= roi_end_d); + if (!in_roi) { + continue; + } + + size_t offset = (roi_n * channels + c) * pooled_depth * pooled_height * pooled_width; + + // Compute feasible set of pooled units that could have pooled + // this in unit + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + int roi_depth = max(roi_end_d - roi_start_d + 1, 1); + + ComputeT bin_size_d = static_cast(roi_depth) / static_cast(pooled_depth); + ComputeT bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + ComputeT bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int pdstart = floor(static_cast(d - roi_start_d) / bin_size_d); + int pdend = ceil(static_cast(d - roi_start_d + 1) / bin_size_d); + int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); + int phend = ceil(static_cast(h - roi_start_h + 1) / bin_size_h); + int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); + int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); + + pdstart = min(max(pdstart, 0), pooled_depth); + pdend = min(max(pdend, 0), pooled_depth); + phstart = min(max(phstart, 0), pooled_height); + phend 
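+// Illustrative note (not from the original marvin source): the ROI pooling
+// backward kernels invert the argmax bookkeeping of the forward pass.  For
+// each input element i they visit every ROI of the same batch item that
+// contains i, restrict attention to the pooled cells whose bins could have
+// covered i (the phstart..phend / pwstart..pwend ranges computed here), and
+// accumulate out_diff for every cell whose recorded argmax equals i:
+//     dL/dx[i] += sum over pooled cells p with argmax_data[p] == i of dL/dy[p]
+// so, exactly as in ordinary max pooling, only bin maxima receive gradient.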
= min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + size_t out_index = (pd * pooled_height + ph) * pooled_width + pw; + if (argmax_data[offset + out_index] == ((d * height + h) * width + w)) { + gradient += GPUStorage2ComputeT(out_diff[offset+out_index]); + } + } + } + } + } + in_diff[index] = GPUCompute2StorageT(gradient); + } +} + +__global__ void Kernel_bsa2b(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* a, StorageT* b){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + b[idx] = GPUCompute2StorageT(GPUStorage2ComputeT(b[idx]) - GPUStorage2ComputeT(a[idx])); + } +} + +void bsa2b(size_t N, const StorageT* a, StorageT* b){ + Kernel_bsa2b<<>>(CUDA_GET_LOOPS(N),N,a,b); +} + +__global__ void Kernel_update_SGDL1(size_t CUDA_NUM_LOOPS, size_t N, int nNets, ComputeT decay, ComputeT momentum, ComputeT lr, const StorageT* weights, StorageT* gradients){ + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + ComputeT h = GPUStorage2ComputeT(gradients[idx]); + ComputeT g; + if (w>0) g = decay; + else if (w<0) g = -decay; + else g = 0; + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + ComputeT h = GPUStorage2ComputeT(gradients[idx]); + ComputeT g = decay * w; // L2 regularization + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + size_t h2_idx = N*(nNets+2)+idx; + ComputeT h2 = GPUStorage2ComputeT(gradients[h2_idx]); + + ComputeT g; + if (w>0) g = decay; + else if (w<0) g = -decay; + else g = 0; + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + size_t h2_idx = N*(nNets+2)+idx; + ComputeT h2 = GPUStorage2ComputeT(gradients[h2_idx]); + + ComputeT g = decay * w; // L2 regularization + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + ComputeT u = GPUStorage2ComputeT(gradients[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + ComputeT g; + if (w>0) g = decay; + else if (w<0) g = -decay; + else g = 0; + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + ComputeT u = GPUStorage2ComputeT(gradients[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + ComputeT g = decay * w; // L2 regularization + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = 
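+// Illustrative sketch (not from the original marvin source): the
+// Kernel_update_* family only stages the parameter step.  Each kernel combines
+// the regularizer (L1: sign(w) * decay, L2: decay * w) with the data gradient
+// -- the stripped `for (int k=1; ...)` loops presumably sum the gradient
+// copies of the nNets replicas -- and folds it into the history buffer;
+// Layer::update() later applies the step with bsa2b (b <- b - a), i.e.
+// w <- w - h.  CPU sketch of one SGD/L2 step under those assumptions:
+#if 0  // illustration only
+void sgd_l2_step_cpu(float* w, float* h, const float* grad, size_t n,
+                     float lr, float momentum, float decay) {
+    for (size_t i = 0; i < n; ++i) {
+        float g = decay * w[i] + grad[i];    // L2 weight decay + data gradient
+        h[i]    = momentum * h[i] + lr * g;  // momentum-smoothed history
+        w[i]   -= h[i];                      // what bsa2b does on the GPU
+    }
+}
+#endif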
GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + size_t h2_idx = N*(nNets+2)+idx; + ComputeT h2 = GPUStorage2ComputeT(gradients[h2_idx]); + ComputeT g; + if (w>0) g = decay; + else if (w<0) g = -decay; + else g = 0; + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + size_t h2_idx = N*(nNets+2)+idx; + ComputeT h2 = GPUStorage2ComputeT(gradients[h2_idx]); + ComputeT g = decay * w; // L2 regularization + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + ComputeT g; + if (w>0) g = decay; + else if (w<0) g = -decay; + else g = 0; + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + ComputeT g = decay * w; // L2 regularization + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + ComputeT g; + if (w>0) g = decay; + else if (w<0) g = -decay; + else g = 0; + for (int k=1; k= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + ComputeT w = GPUStorage2ComputeT(weights[idx]); + size_t h_idx = N*(nNets+1)+idx; + ComputeT h = GPUStorage2ComputeT(gradients[h_idx]); + ComputeT g = decay * w; // L2 regularization + for (int k=1; k>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,lr,weights,gradients); + else + Kernel_update_SGDL2<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,lr,weights,gradients); + break; + case AdaDelta: + if (regularizer==L1) + Kernel_update_AdaDeltaL1<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,delta,lr,weights,gradients); + else + Kernel_update_AdaDeltaL2<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,delta,lr,weights,gradients); + break; + case AdaGrad: + if (regularizer==L1) + Kernel_update_AdaGradL1<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,delta,lr,weights,gradients); + else + Kernel_update_AdaGradL2<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,delta,lr,weights,gradients); + break; + case Adam: + if (regularizer==L1) + Kernel_update_AdamL1<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,momentum2,delta,iter+1,lr,weights,gradients); + else + Kernel_update_AdamL2<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,momentum2,delta,iter+1,lr,weights,gradients); + break; + case NAG: + if (regularizer==L1) + Kernel_update_NAGL1<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,delta,lr,weights,gradients); + else + Kernel_update_NAGL2<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,momentum,delta,lr,weights,gradients); + break; + case RMSprop: + if (regularizer==L1) + Kernel_update_RMSpropL1<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,rms_decay,delta,lr,weights,gradients); + else + Kernel_update_RMSpropL2<<>>(CUDA_GET_LOOPS(N),N,nNets,decay,rms_decay,delta,lr,weights,gradients); + break; + } + checkCUDA(__LINE__,cudaGetLastError()); +} + +__global__ void Kernel_xpy(size_t CUDA_NUM_LOOPS, size_t N, const StorageT* x, StorageT* y){ + const size_t idxBase = 
size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ + y[idx] = GPUCompute2StorageT( GPUStorage2ComputeT(y[idx]) + GPUStorage2ComputeT(x[idx])); + } +} + +void xpy(size_t N, const StorageT* x, StorageT* y){ + Kernel_xpy<<>>(CUDA_GET_LOOPS(N),N,x,y); + checkCUDA(__LINE__,cudaGetLastError()); +} + +__global__ void Kernel_maxElement(size_t N, const StorageT *x, size_t* pMaxID, ComputeT* pMaxValue){ + const size_t idx = CUDA_NUM_THREADS * blockIdx.x + threadIdx.x; + if (idx > 0) return; + //printf("%d %f\n", 0, GPUStorage2ComputeT(x[0]) ); + ComputeT maxValue = GPUStorage2ComputeT(x[0]); + size_t maxID = 0; + for (size_t i=1;imaxValue){ + maxValue = GPUStorage2ComputeT(x[i]); + maxID = i; + } + //printf("%d %f %d\n", i, GPUStorage2ComputeT(x[i]), maxID); + } + if (pMaxID!=NULL) *pMaxID = maxID; + if (pMaxValue!=NULL) *pMaxValue = maxValue; +} + +void GPU_maxElement(size_t N, const StorageT *x, size_t* cpuMaxID, ComputeT* cpuMaxValue){ + size_t* gpuMaxID; cudaMalloc(&gpuMaxID, sizeof(size_t)); + ComputeT* gpuMaxValue; cudaMalloc(&gpuMaxValue, sizeof(ComputeT)); + + Kernel_maxElement<<<1,1>>>(N, x, gpuMaxID, gpuMaxValue); + + cudaMemcpy(cpuMaxID, gpuMaxID, sizeof(size_t), cudaMemcpyDeviceToHost); cudaFree(gpuMaxID); + cudaMemcpy(cpuMaxValue, gpuMaxValue, sizeof(ComputeT), cudaMemcpyDeviceToHost); cudaFree(gpuMaxValue); +} + +__global__ void Kernel_Hasum(size_t N, const half *x, int incx, float *result){ + const int i = CUDA_NUM_THREADS * blockIdx.x + threadIdx.x; + if (i > 0) return; + + float r = 0; + for (int i=0;i>>(n, x, incx, answer); + cudaMemcpy(result, answer, sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(answer); + return CUBLAS_STATUS_SUCCESS; +} + +cublasStatus_t Hgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const half *A, int lda, const half *B, int ldb, const float *beta, half *C, int ldc){ + return cublasSgemmEx(handle, transa, transb, m, n, k, alpha, A, CUBLAS_DATA_HALF, lda, B, CUBLAS_DATA_HALF, ldb, beta, C, CUBLAS_DATA_HALF, ldc); +} + + +////////////////////////////////////////////////////////////////////////////////////////////////// +// File format +////////////////////////////////////////////////////////////////////////////////////////////////// + +uint8_t typeID(std::type_index t){ + if (t==typeid(half)) return uint8_t(0); + if (t==typeid(float)) return uint8_t(1); + if (t==typeid(double)) return uint8_t(2); + if (t==typeid(uint8_t)) return uint8_t(3); + if (t==typeid(uint16_t)) return uint8_t(4); + if (t==typeid(uint32_t)) return uint8_t(5); + if (t==typeid(uint64_t)) return uint8_t(6); + if (t==typeid(int8_t)) return uint8_t(7); + if (t==typeid(int16_t)) return uint8_t(8); + if (t==typeid(int32_t)) return uint8_t(9); + if (t==typeid(int64_t)) return uint8_t(10); + if (t==typeid(char)) return uint8_t(11); + if (t==typeid(bool)) return uint8_t(12); + FatalError(__LINE__); return uint8_t(255); +} + +uint8_t readTypeID(std::string filename){ + FILE* fp = fopen(filename.c_str(),"rb"); + while (fp==NULL) { + std::cerr<<"readTypeID: fail to open file "< +class Tensor{ +public: + std::vector dim; + T* CPUmem; + std::string name; + + // compile will check if your time is not correct for writeGPU and readGPU + void writeGPU(T* GPUmem){ + cudaMemcpy(GPUmem, CPUmem, numel()*sizeof(T), cudaMemcpyHostToDevice); + }; + + void readGPU(T* GPUmem){ + 
cudaMemcpy(CPUmem, GPUmem, numel()*sizeof(T), cudaMemcpyDeviceToHost); + }; + + Tensor(): CPUmem(NULL){}; + + size_t numel(){ return marvin::numel(dim); }; + + size_t numBytes(){ return sizeof(T)*numel(); }; + + int numofitems(){ return dim[0]; }; + + size_t sizeofitem(){ return marvin::sizeofitem(dim); }; + + ~Tensor(){ + if (CPUmem!=NULL) delete[] CPUmem; + }; + + void initialize(T val){ + for (size_t i=0;i0){ + read_cnt = fread((void*)(name.data()), sizeof(char), lenName, fp); + if (read_cnt!=lenName) { std::cerr<<"Error at Tensor::readHeader: wrong data type. "<0){ + read_cnt = fread((void*)(&dim[0]), sizeof(int), nbDims, fp); + if (read_cnt!=nbDims) { std::cerr<<"Error at Tensor::readHeader: wrong data type. "<* floatTensor = new Tensor(fp); + this->dim = floatTensor->dim ; + this->name = floatTensor->name; + Malloc(batch_size); + for(size_t i=0; iCPUmem[i]); + memcpy(((half*)(CPUmem))+i,&v,sizeof(half)); + } + delete floatTensor; + }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(half))){ + fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); + Tensor* halfTensor = new Tensor(fp); + this->dim = halfTensor->dim ; + this->name = halfTensor->name; + Malloc(batch_size); + for(size_t i=0; iCPUmem[i]); + memcpy(((float*)(CPUmem))+i,&v,sizeof(float)); + } + delete halfTensor; + }else if (myTypeid==typeID(typeid(double)) && fpTypeid==typeID(typeid(float))){ + fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); + Tensor* floatTensor = new Tensor(fp); + this->dim = floatTensor->dim ; + this->name = floatTensor->name; + Malloc(batch_size); + for(size_t i=0; iCPUmem[i]); + memcpy(((double*)(CPUmem))+i,&v,sizeof(double)); + } + delete floatTensor; + }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(double))){ + fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); + Tensor* doubleTensor = new Tensor(fp); + this->dim = doubleTensor->dim ; + this->name = doubleTensor->name; + Malloc(batch_size); + for(size_t i=0; iCPUmem[i]); + memcpy(((float*)(CPUmem))+i,&v,sizeof(float)); + } + delete doubleTensor; + }else if (myTypeid==typeID(typeid(half)) && fpTypeid==typeID(typeid(double))){ + fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); + Tensor* doubleTensor = new Tensor(fp); + this->dim = doubleTensor->dim ; + this->name = doubleTensor->name; + Malloc(batch_size); + for(size_t i=0; iCPUmem[i])); + memcpy(((half*)(CPUmem))+i,&v,sizeof(half)); + } + delete doubleTensor; + }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(half))){ + fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); + Tensor* halfTensor = new Tensor(fp); + this->dim = halfTensor->dim ; + this->name = halfTensor->name; + Malloc(batch_size); + for(size_t i=0; iCPUmem[i])); + memcpy(((double*)(CPUmem))+i,&v,sizeof(double)); + } + delete halfTensor; + }else{ + std::cerr<<"Tensor conversion is not supported: from Type "<0){ + read_cnt = fread((void*)(name.data()), sizeof(char), lenName, fp); + if (read_cnt!=lenName) return NULL; + } + int nbDims; + read_cnt = fread((void*)(&nbDims), sizeof(int), 1, fp); + if (read_cnt!=1) return NULL; + dim.resize(nbDims); + if (nbDims>0){ + read_cnt = fread((void*)(&dim[0]), sizeof(int), nbDims, fp); + if (read_cnt!=nbDims) return NULL; + } + + size_t n = numel(); + Malloc(batch_size); + read_cnt = fread((void*)(CPUmem), sizeof(T), n, fp); + if (read_cnt!=n){ + delete [] CPUmem; + CPUmem = NULL; + return NULL; + } + } + + return CPUmem; + }; + + void Malloc(int batch_size){ + size_t n = numel(); + std::cout<<" "; 
memorySizePrint(n*sizeof(T)); std::cout< dim2write){ + uint8_t myTypeid = typeID(typeid(T)); + fwrite((void*)(&myTypeid), sizeof(uint8_t), 1, fp); + uint32_t typesizeof = uint32_t(sizeof(T)); + fwrite((void*)(&typesizeof), sizeof(uint32_t), 1, fp); + int lenName = name.size(); + fwrite((void*)(&lenName), sizeof(int), 1, fp); + if (lenName>0) fwrite((void*)(name.data()), sizeof(char), lenName, fp); + int nbDims = dim2write.size(); + fwrite((void*)(&nbDims), sizeof(int), 1, fp); + if (nbDims>0) fwrite((void*)(&dim2write[0]), sizeof(int), nbDims, fp); + if (ferror (fp)){ + std::cerr << "disk writing failed"<0){ + fwrite((void*)(CPUmem), sizeof(T), n, fp); + if (ferror (fp)){ + std::cerr << "disk writing failed" << std::endl; + FatalError(); + } + } + }; + + //support continuous write across many NdTensors + //write with header + void write(FILE* fp){ + writeHeader(fp,dim); + writeData(fp); + }; + + void write(std::string filename){ + FILE* fp = fopen(filename.c_str(),"wb"); + while (fp==NULL) { + std::cerr<<"Tensor::write: fail to open file "< dim_): dim(dim_){ CPUmem = new T [numel()]; }; + + Tensor(std::vector dim_, T initValue): dim(dim_){ + int n = numel(); + CPUmem = new T [n]; + if (initValue == T(0)) + memset(CPUmem, 0, n*sizeof(T)); + else + for (int i=0;i dim_): name(name_),dim(dim_){ + CPUmem = new T [numel()]; + }; + + void permute(std::vector v){ + size_t nbItems = numofitems(); + size_t sizeofitem_ = sizeofitem(); + size_t nbBytes = sizeofitem_ * sizeof(T); + T* CPUmemNew = new T[numel()]; + memcpy(CPUmemNew, CPUmem, nbItems * nbBytes); + for (size_t i=0;i display_dim){ + + std::cout<<" name:"< +std::vector*> readTensors(std::string filename, size_t max_count = SIZE_MAX){ + + FILE* fp = fopen(filename.c_str(),"rb"); + + while (fp==NULL) { + std::cerr<<"readTensors: fail to open file "<*> tensors; + size_t count = 0; + while (feof(fp)==0) { + tensors.push_back(new Tensor(fp)); + count++; + if (count>=max_count) break; + int c = getc(fp); + ungetc(c, fp); + } + fclose(fp); + return tensors; +} + +template +void writeTensors(std::string filename, std::vector*> tensors){ + FILE* fp = fopen(filename.c_str(),"wb"); + while (fp==NULL) { + std::cerr<<"writeTensors: fail to open file "<write(fp); + } + fclose(fp); +} + + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Response and Layer +////////////////////////////////////////////////////////////////////////////////////////////////// + +class Response{ +public: + std::string name; + cudnnTensorDescriptor_t desc; + cublasHandle_t cublasHandle; + std::vector desc_group; + std::vector number_group; + + bool isProxy; + + StorageT* dataGPU; + StorageT* diffGPU; + bool need_diff; + std::vector dim; + std::vector stride; + + std::vector receptive_field; + std::vector receptive_gap; + std::vector receptive_offset; + + size_t sizeofitem(){ return marvin::sizeofitem(dim); }; + + size_t numBytes(){ return sizeofStorageT*(marvin::numel(dim)); }; + + Response(std::string name_, bool need_diff_=false): name(name_), dataGPU(NULL), diffGPU(NULL), need_diff(need_diff_), isProxy(false){ + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&desc)); + }; + + size_t Malloc(std::vector dim_, StorageT* dataGPUexisting=NULL, StorageT* diffGPUexisting=NULL){ + size_t memoryBytes = 0; + if (dataGPU==NULL){ // two layers (one for training, one for testing) may output to the same response and Malloc twice, ignore the second time + + dim = dim_; + stride.resize(dim.size()); + + stride[dim.size()-1] = 1; + for (int 
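+// Illustrative sketch (not from the original marvin source): a standalone
+// reader for the tensor header laid out by writeHeader()/readHeader() above.
+// The type codes follow typeID(): 0 = half, 1 = float, 2 = double, 3..12 =
+// the unsigned/signed integer, char and bool types; the raw data
+// (prod(dims) elements of elem_size bytes) immediately follows the header.
+#if 0  // illustration only
+#include <cstdint>
+#include <cstdio>
+#include <string>
+#include <vector>
+// Layout: [uint8 typeID][uint32 sizeof(T)][int lenName][lenName chars]
+//         [int nbDims][nbDims x int dims][raw data]
+bool read_tensor_header(FILE* fp, uint8_t& type_id, uint32_t& elem_size,
+                        std::string& name, std::vector<int>& dims) {
+    int len = 0, nb_dims = 0;
+    if (fread(&type_id,   sizeof(type_id),   1, fp) != 1) return false;
+    if (fread(&elem_size, sizeof(elem_size), 1, fp) != 1) return false;
+    if (fread(&len,       sizeof(len),       1, fp) != 1) return false;
+    name.resize(len);
+    if (len > 0 && fread(&name[0], 1, len, fp) != size_t(len)) return false;
+    if (fread(&nb_dims, sizeof(nb_dims), 1, fp) != 1) return false;
+    dims.resize(nb_dims);
+    if (nb_dims > 0 &&
+        fread(dims.data(), sizeof(int), nb_dims, fp) != size_t(nb_dims))
+        return false;
+    return true;
+}
+#endif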
d=dim.size()-2;d>=0;--d){ + stride[d] = stride[d+1] * dim[d+1]; + } + + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(desc, + CUDNNStorageT, + dim.size(), + &dim[0], + &stride[0]) ); + + std::cout<<" "; + std::cout<< (need_diff? "* " : " "); + + std::cout< dim_new = dim; + dim_new[1] = dim[1]/group; + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(desc_new, + CUDNNStorageT, + dim_new.size(), + &dim_new[0], + &stride[0]) ); + desc_group.push_back(desc_new); + return desc_new; + } + + ~Response(){ + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(desc)); + for (int i=0; i display_dim, bool printData=true){ + if (!printData && diffGPU==NULL) return; + Tensor* feature = new Tensor(dim); + feature->readGPU((printData? dataGPU: diffGPU)); + feature->print(display_dim); + delete feature; + }; + + + int checkNaN(){ + return marvin::checkNaN(dataGPU, numel(dim)); + }; + + int checkNaNdiff(){ + return marvin::checkNaN(diffGPU, numel(dim)); + }; + + ComputeT ameanData(){ + if (dataGPU!=NULL){ + ComputeT result; + size_t n = numel(dim); + //std::cout<<"n="< dim, Filler filler, + ComputeT param = 0) { + int n = numel(dim); + StorageT *CPUbuf = new StorageT[n]; + switch (filler) { + case Xavier: { + int fan_in = ComputeT(n / dim[0]); + ComputeT scale = sqrt(ComputeT(3) / fan_in); + + //default_random_engine generator; + std::uniform_real_distribution distribution(-scale, + scale); + for (StorageT *p = CPUbuf; p != CPUbuf + n; ++p) { + *p = CPUCompute2StorageT(distribution(rng)); + } + } + break; + case Gaussian: { + std::normal_distribution distribution(0, param); + for (StorageT *p = CPUbuf; p != CPUbuf + n; ++p) { + *p = CPUCompute2StorageT(distribution(rng)); + } + } + break; + case Constant: { + StorageT paramStorageT = CPUCompute2StorageT(param); + for (StorageT *p = CPUbuf; p != CPUbuf + n; ++p) { + *p = paramStorageT; + } + } + break; + } + checkCUDA(__LINE__, cudaMemcpy(GPUmem, CPUbuf, n * sizeofStorageT, + cudaMemcpyHostToDevice)); + + delete[] CPUbuf; + } + + void randInit() { + if (weight_dataGPU != NULL) fillGPU(weight_dataGPU, weight_dim, weight_filler, weight_filler_param); + if (bias_dataGPU != NULL) fillGPU(bias_dataGPU, bias_dim, bias_filler, bias_filler_param); + for(int l=0;lrandInit(); + }; + + void clearDiff() { + if (weight_diffGPU != NULL) + checkCUDA(__LINE__, cudaMemset(weight_diffGPU, 0, + sizeofStorageT * weight_numel)); + if (bias_diffGPU != NULL) + checkCUDA(__LINE__, cudaMemset(bias_diffGPU, 0, + sizeofStorageT * bias_numel)); + for(int l=0;lclearDiff(); + }; + + void clearHist() { + if (weight_diffGPU != NULL) + checkCUDA(__LINE__, cudaMemset(weight_histGPU, 0, + sizeofStorageT * weight_numel)); + if (bias_diffGPU != NULL) + checkCUDA(__LINE__, cudaMemset(bias_histGPU, 0, + sizeofStorageT * bias_numel)); + for(int l=0;lclearHist(); + }; + + void setWeights(std::vector *> weights) { + for (int i = 0; i < weights.size(); ++i) { + if (weight_dataGPU != NULL && + weights[i]->name == name + ".weight") { + if (numel(weight_dim) == numel(weights[i]->dim)) { + if (!same_dim(weight_dim, weights[i]->dim)) { + std::cout << "[Warning] " << name << + ".weight is loaded with mismatched dimensions "; + std::cout << "need"; + veciPrint(weight_dim); + std::cout << " vs. file"; + veciPrint(weights[i]->dim); + std::cout << std::endl; + } + std::cout << " " << name << ".weight"; + veciPrint(weights[i]->dim); + std::cout << " is set." 
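+// Illustrative sketch (not from the original marvin source): fillGPU() draws
+// the initial weights on the CPU and copies them to the GPU.  For the Xavier
+// filler the uniform range is +/- sqrt(3 / fan_in) with fan_in = numel/dim[0];
+// the template argument stripped from the std::uniform_real_distribution /
+// std::normal_distribution declarations is presumably ComputeT.  CPU sketch:
+#if 0  // illustration only
+#include <cmath>
+#include <random>
+#include <vector>
+void xavier_fill(std::vector<float>& w, int num_output, std::mt19937& rng) {
+    float fan_in = float(w.size()) / float(num_output); // weights per output
+    float scale  = std::sqrt(3.0f / fan_in);
+    std::uniform_real_distribution<float> dist(-scale, scale);
+    for (float& v : w) v = dist(rng);
+}
+#endif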
<< std::endl; + weights[i]->writeGPU(weight_dataGPU); + } else { + std::cout << "[Warning] " << name << + ".weight is found but not loaded because the numels are mismatched: "; + std::cout << "need"; + veciPrint(weight_dim); + std::cout << " vs. file"; + veciPrint(weights[i]->dim); + std::cout << std::endl; + } + } + if (bias_dataGPU != NULL && weights[i]->name == name + ".bias") { + if (numel(bias_dim) == numel(weights[i]->dim)) { + if (!same_dim(bias_dim, weights[i]->dim)) { + std::cout << "[Warning] " << name << + ".bias is loaded with mismatched dimensions "; + std::cout << "need"; + veciPrint(bias_dim); + std::cout << " vs. file"; + veciPrint(weights[i]->dim); + std::cout << std::endl; + } + std::cout << " " << name << ".bias"; + veciPrint(weights[i]->dim); + std::cout << " is set." << std::endl; + weights[i]->writeGPU(bias_dataGPU); + } else { + std::cout << "[Warning] " << name << + ".bias is found but not loaded because the numels are mismatched: "; + std::cout << "need"; + veciPrint(bias_dim); + std::cout << " vs. file"; + veciPrint(weights[i]->dim); + std::cout << std::endl; + } + + } + } + for(int l=0;lsetWeights(weights); + }; + + void saveWeights(FILE *fp) { + if (weight_dataGPU != NULL) { + Tensor *t = new Tensor( + name + ".weight", weight_dim); + t->readGPU(weight_dataGPU); + t->write(fp); + delete t; + } + + if (bias_dataGPU != NULL) { + Tensor *t = new Tensor( + name + ".bias", bias_dim); + t->readGPU(bias_dataGPU); + t->write(fp); + delete t; + } + + for(int l=0;lsaveWeights(fp); + }; + + void printWeights(std::vector display_weight, + std::vector display_bias) { + if (weight_dataGPU != NULL) { + Tensor *t = new Tensor( + name + ".weight", weight_dim); + t->readGPU(weight_dataGPU); + t->print(display_weight); + delete t; + } + if (bias_dataGPU != NULL) { + Tensor *t = new Tensor( + name + ".bias", bias_dim); + t->readGPU(bias_dataGPU); + t->print(display_bias); + delete t; + } + + for(int l=0;lprintWeights(display_weight,display_bias); + }; + + void setDiffs(std::vector *> weights) { + for (int i = 0; i < weights.size(); ++i) { + if (weight_diffGPU != NULL && + weights[i]->name == name + ".weight_diff") { + std::cout << " " << name << ".weight_diff"; + veciPrint(weights[i]->dim); + std::cout << " is set." << std::endl; + weights[i]->writeGPU(weight_diffGPU); + } + if (bias_diffGPU != NULL && + weights[i]->name == name + ".bias_diff") { + std::cout << " " << name << ".bias_diff"; + veciPrint(weights[i]->dim); + std::cout << " is set." 
<< std::endl; + weights[i]->writeGPU(bias_diffGPU); + } + } + + for(int l=0;lsetDiffs(weights); + }; + + void saveDiffs(FILE *fp) { + if (weight_diffGPU != NULL) { + Tensor *t = new Tensor( + name + ".weight_diff", weight_dim); + t->readGPU(weight_diffGPU); + t->write(fp); + delete t; + } + + if (bias_diffGPU != NULL) { + Tensor *t = new Tensor( + name + ".bias_diff", bias_dim); + t->readGPU(bias_diffGPU); + t->write(fp); + delete t; + } + + for(int l=0;lsaveDiffs(fp); + }; + + void printDiffs(std::vector display_weight, + std::vector display_bias) { + if (weight_diffGPU != NULL) { + Tensor *t = new Tensor( + name + ".weight_diff", weight_dim); + t->readGPU(weight_diffGPU); + t->print(display_weight); + delete t; + } + if (bias_diffGPU != NULL) { + Tensor *t = new Tensor( + name + ".bias_diff", bias_dim); + t->readGPU(bias_diffGPU); + t->print(display_bias); + delete t; + } + for(int l=0;lprintDiffs(display_weight,display_bias); + }; + + void update() { + if (train_me) { + if (weight_numel > 0 && weight_histGPU != NULL) + bsa2b(weight_numel, weight_histGPU, weight_dataGPU); + if (bias_numel > 0 && bias_histGPU != NULL) + bsa2b(bias_numel, bias_histGPU, bias_dataGPU); + for(int l=0;lupdate(); + } + }; + +}; + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Layers +////////////////////////////////////////////////////////////////////////////////////////////////// + +class DataLayer : public Layer { +public: + // parameters: + bool random; + int counter; + int epoch; + bool isDataLayer(){ return true; }; + DataLayer(): counter(0), epoch(0), random(false){}; + DataLayer(std::string name_): Layer(name_), counter(0), epoch(0), random(false){}; + virtual int numofitems() = 0; + virtual void shuffle() = 0; +}; + + +class TensorLayer: public DataLayer { + StorageT* tensorGPU; +public: + std::vector files; + std::vector > dim; + + TensorLayer(std::string name_): DataLayer(name_), tensorGPU(NULL){ + train_me = false; + }; + + TensorLayer(JSON* json): tensorGPU(NULL){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, files ) + train_me = false; + }; + + ~TensorLayer(){ + if (tensorGPU!=NULL) checkCUDA(__LINE__, cudaFree(tensorGPU)); + }; + + int numofitems(){ + return dim[0][0]; + }; + + void shuffle(){ + + }; + + void forward(Phase phase_){ + ++epoch; + }; + + size_t Malloc(Phase phase_){ + std::cout<< (train_me? 
"* " : " "); + std::cout<* tensorCPU = new Tensor(files[i]); + dim[i] = tensorCPU->dim; + out[i]->need_diff = false; + std::cout<<"tensorCPU->dim="; veciPrint(tensorCPU->dim); std::cout<Malloc(tensorCPU->dim); + checkCUDA(__LINE__, cudaMemcpy(out[i]->dataGPU, tensorCPU->CPUmem, tensorCPU-> numBytes(), cudaMemcpyHostToDevice) ); + delete tensorCPU; + } + return memoryBytes; + }; +}; + +class SequenceGenerationLayer: public DataLayer { + int channel; + size_t iter; +public: + int length; + int seed; + Response* resultResponse; + std::string result; + std::string map2char; + Tensor* tensor2char; + + SequenceGenerationLayer(std::string name_): DataLayer(name_), resultResponse(NULL), tensor2char(NULL), iter(0){ + train_me = false; + }; + + SequenceGenerationLayer(JSON* json): resultResponse(NULL), tensor2char(NULL), iter(0){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, map2char ) + SetOrDie(json, length ) + SetOrDie(json, seed ) + SetOrDie(json, result ) + + train_me = false; + }; + + ~SequenceGenerationLayer(){ + if (tensor2char!=NULL) delete tensor2char; + }; + + int numofitems(){ + return length; + }; + + void shuffle(){}; + + void forward(Phase phase_){ + if (iter == 0){ + GPU_set_zeros(1, out[1]->dataGPU); + GPU_set_one_hot(channel, out[0]->dataGPU, seed); + + //std::cout<CPUmem[seed]; + }else{ + GPU_set_ones(1, out[1]->dataGPU); + + size_t maxID; + ComputeT maxValue; + GPU_maxElement(channel, resultResponse->dataGPU, &maxID, &maxValue); + + //maxID = iter%65; + //std::cout<<"CPU maxID="<CPUmem[maxID]; + + GPU_set_one_hot(channel, out[0]->dataGPU, maxID); + } + + ++iter; + + if (iter ==length) + ++epoch; + }; + + size_t Malloc(Phase phase_){ + std::cout<< (train_me? "* " : " "); + std::cout<(map2char); + channel = tensor2char->numel(); + + std::vector dim; + dim.push_back(1); + dim.push_back(1); + dim.push_back(channel); + + out[0]->need_diff = false; + memoryBytes += out[0]->Malloc(dim); + + out[1]->need_diff = false; + dim[2]=1; + memoryBytes += out[1]->Malloc(dim); + + return memoryBytes; + }; +}; + + + +class MemoryDataLayer : public DataLayer { + std::vector*> dataCPU; + public: + std::vector file_data; + std::vector file_mean; + std::vector scale; + std::vector mean; + int batch_size; + + int numofitems(){ + return dataCPU[0]->dim[0]; + }; + void init(){ + train_me = false; + std::cout<<"MemoryDataLayer "< (file_data[i],batch_size); + dataCPU[i]->print(veci(0)); + } + + if (file_mean.size()>0){ + for (int i =0;i* meanCPU = new Tensor(file_mean[i],batch_size); + meanCPU->print(veci(0)); + + if (meanCPU->numel() != dataCPU[i]->sizeofitem()){ + std::cerr<<"mean tensor file size error: "<dim); std::cerr<dim); std::cerr<CPUmem; + StorageT* dE = dataCPU[i]->CPUmem + dataCPU[i]->numel(); + + StorageT* m = meanCPU->CPUmem; + StorageT* mE = meanCPU->CPUmem + meanCPU->numel(); + + while(d!=dE){ + *d = CPUCompute2StorageT( CPUStorage2ComputeT(*d) - CPUStorage2ComputeT(*m) ); + ++m; + if (m==mE) m = meanCPU->CPUmem; + ++d; + } + delete meanCPU; + } + } + + for (int i =0;iCPUmem + dataCPU[i]->numel(); + for(StorageT* d = dataCPU[i]->CPUmem; d!=dE; ++d){ + *d = CPUCompute2StorageT( CPUStorage2ComputeT(*d) * scale[i] ); + } + } + } + for (int i =0;iCPUmem + dataCPU[i]->numel(); + for(StorageT* d = dataCPU[i]->CPUmem; d!=dE; ++d){ + *d = CPUCompute2StorageT( CPUStorage2ComputeT(*d) - mean[i] ); + } + } + } + + if (phase!=Testing) shuffle(); + } + + MemoryDataLayer(std::string name_, Phase phase_, std::vector file_data_, int batch_size_): DataLayer(name_), 
batch_size(batch_size_), file_data(file_data_){ + phase = phase_; + init(); + }; + MemoryDataLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, Training) + SetOrDie(json, file_data ) + SetValue(json, file_mean, std::vector(0)) + SetValue(json, batch_size, 64) + SetValue(json, scale, std::vector(0)) + SetValue(json, mean, std::vector(0)) + SetValue(json, random, true) + init(); + }; + ~MemoryDataLayer(){ + for (int i =0; ineed_diff = false; + std::vector data_dim = dataCPU[i]->dim; + data_dim[0] = batch_size; + out[i]->receptive_field.resize(data_dim.size()-2); fill_n(out[i]->receptive_field.begin(), data_dim.size()-2,1); + out[i]->receptive_gap.resize(data_dim.size()-2); fill_n(out[i]->receptive_gap.begin(), data_dim.size()-2,1); + out[i]->receptive_offset.resize(data_dim.size()-2); fill_n(out[i]->receptive_offset.begin(),data_dim.size()-2,0); + memoryBytes += out[i]->Malloc(data_dim); + } + return memoryBytes; + } + void shuffle(){ + if (!random) return; + std::vector v = randperm(dataCPU[0]->numofitems(), rng); + for(int i =0; i permute(v); + } + }; + + void forward(Phase phase_){ + if (counter + batch_size >= dataCPU[0]->numofitems() ){ + ++epoch; + if(phase!=Testing){ + shuffle(); + counter = 0; + } + } + for(int i =0; i dataGPU, dataCPU[i]->CPUmem + (size_t(counter) * size_t( dataCPU[i]->sizeofitem())), batch_size * dataCPU[i]->sizeofitem() * sizeofStorageT, cudaMemcpyHostToDevice) ); + } + counter+=batch_size; + if (counter >= dataCPU[0]->numofitems()) counter = 0; + }; +}; + +class PlaceHolderDataLayer : public DataLayer { + public: + std::vector dim; + + int numofitems(){ + return 1; + }; + void init(){ + train_me = false; + std::cout<<"PlaceHolderDataLayer "<need_diff = false; + out[0]->receptive_field.resize(dim.size()-2); fill_n(out[0]->receptive_field.begin(), dim.size()-2,1); + out[0]->receptive_gap.resize(dim.size()-2); fill_n(out[0]->receptive_gap.begin(), dim.size()-2,1); + out[0]->receptive_offset.resize(dim.size()-2); fill_n(out[0]->receptive_offset.begin(),dim.size()-2,0); + memoryBytes += out[0]->Malloc(dim); + + return memoryBytes; + } + void shuffle(){}; + void forward(Phase phase_){}; +}; + +template +class DiskDataLayer : public DataLayer { + std::future lock; + + std::vector ordering; + std::bernoulli_distribution* distribution_bernoulli; + std::vector*> distribution_uniform; + + std::vector dataFILE; + std::vector dataCPU; + std::vector dataGPU; + std::vector item_raw; + + Tensor* labelCPUall; + Tensor* labelCPU; + std::vector mean_data_GPU; + StorageT* labelGPU; + + size_t numel_per_channel_crop ; + size_t numel_all_channel_crop ; + size_t numel_per_channel_orgi ; + size_t numel_batch_all_channel_crop ; + + int epoch_prefetch; + + size_t bytes_per_item; + size_t headerBytes; + std::vector size_data; + public: + bool mirror; + std::vector size_crop; + std::vector file_data; + std::vector file_mean; + std::string file_label; + int batch_size; + + int numofitems(){ + return labelCPUall->numofitems(); + }; + + void init(){ + epoch_prefetch = 0; + distribution_bernoulli = new std::bernoulli_distribution(0.5); + //dataFILE = NULL; + //dataCPU = NULL; + //dataGPU = NULL; + labelCPU = NULL; + labelGPU = NULL; + labelCPUall = NULL; + train_me = false; + std::cout<<"DiskDataLayer "<* meanCPU = new Tensor(file_mean[i],batch_size); + meanCPU->print(veci(0)); + checkCUDA(__LINE__, cudaMalloc(&mean_data_GPU[i], meanCPU->numBytes()) ); + meanCPU->writeGPU(mean_data_GPU[i]); + delete meanCPU; + } + + + Tensor tensor; + headerBytes = tensor.readHeader(dataFILE[0]); 
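+        // The header of the first data file gives the stored item geometry: dim[0] is
+        // the number of items, and the remaining dimensions (channels, then spatial
+        // size) are appended to size_data below. size_crop and size_data then determine
+        // the cropped/original element counts and bytes_per_item used when batches are
+        // assembled.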
+ + size_data.insert( size_data.end(), tensor.dim.begin()+1, tensor.dim.end() ); + + + numel_per_channel_crop = numel(size_crop); + numel_all_channel_crop = size_data[0] * numel_per_channel_crop; + numel_per_channel_orgi = sizeofitem(size_data); + numel_batch_all_channel_crop = batch_size*numel_all_channel_crop; + bytes_per_item = sizeof(T)* numel(size_data); + + std::vector data_dim; + data_dim.push_back(batch_size); + data_dim.push_back(size_data[0]); + data_dim.insert( data_dim.end(), size_crop.begin(), size_crop.end() ); + + for (int i = 0;i(file_label); + labelCPUall -> print(veci(0)); + std::cout<<" "; labelCPUall->printRange(); + while (labelCPUall->dim.size()dim.push_back(1); + std::vector label_dim = labelCPUall->dim; + label_dim[0] = batch_size; + labelCPU = new Tensor(label_dim); + + + distribution_uniform.resize(size_crop.size()); + for (int d=0; d(0,size_data[d+1] - size_crop[d]); + } + + if (phase!=Testing){ + shuffle(); + }else{ + ordering.resize(numofitems()); + for (int i=0;i size_data_, std::vector size_crop_, std::vector file_data_, std::string file_label_, int batch_size_): + DataLayer(name_), mirror(mirror_), size_data(size_data_), size_crop(size_crop_), file_data(file_data_), file_label(file_label_), batch_size(batch_size_){ + phase = phase_; + init(); + }; + + DiskDataLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, Training) + SetValue(json, mirror, false) + SetOrDie(json, file_data ) + SetValue(json, file_mean, std::vector(0)) + SetValue(json, file_label,"") + SetOrDie(json, batch_size ) + SetOrDie(json, size_crop ) + SetValue(json, random, true) + init(); + }; + + ~DiskDataLayer(){ + if (lock.valid()) lock.wait(); + delete distribution_bernoulli; + for (int i=0;inumofitems(), rng); + } + }; + + void prefetch(){ + + checkCUDA(__LINE__,cudaSetDevice(GPU)); + + + std::vector begin_coor(size_crop.size()); + + for (size_t i=0;isizeofitem(); + memcpy(labelCPU->CPUmem+i*labelSizeOfItem, labelCPUall->CPUmem+image_i*labelSizeOfItem, labelSizeOfItem*sizeofStorageT); + + // mirror + bool mirror_this = false; + if (mirror) mirror_this = ((*distribution_bernoulli)(rng)); + if (numel_per_channel_orgi != numel_per_channel_crop || mirror_this){ + for (int d=0;d= ordering.size()){ + if (phase!=Testing) shuffle(); + counter = 0; + ++epoch_prefetch; + } + }//end for (size_t i=0;iwriteGPU(labelGPU); + + }; + + void forward(Phase phase_){ + lock.wait(); + epoch = epoch_prefetch; + for (int data_i = 0; data_i>>(CUDA_GET_LOOPS(numel_batch_all_channel_crop), numel_batch_all_channel_crop, numel_all_channel_crop, dataGPU[data_i], mean_data, out[data_i]->dataGPU); + } + std::swap(out[file_data.size()]->dataGPU,labelGPU); + lock = std::async(std::launch::async,&DiskDataLayer::prefetch,this); + }; + + + size_t Malloc(Phase phase_){ + + if (phase == Training && phase_==Testing) return 0; + + + + if (out.size()!=file_data.size()+1){ std::cout<<"DiskDataLayer: # of out's should match the # of in-1"< data_dim; + data_dim.push_back(batch_size); + data_dim.push_back(size_data[0]); + data_dim.insert( data_dim.end(), size_crop.begin(), size_crop.end() ); + + for (int data_i = 0; data_ineed_diff = false; + out[data_i]->receptive_field.resize(data_dim.size()-2); fill_n(out[data_i]->receptive_field.begin(),data_dim.size()-2,1); + out[data_i]->receptive_gap.resize(data_dim.size()-2); fill_n(out[data_i]->receptive_gap.begin(),data_dim.size()-2,1); + out[data_i]->receptive_offset.resize(data_dim.size()-2);fill_n(out[data_i]->receptive_offset.begin(),data_dim.size()-2,0); + memoryBytes += 
out[data_i]->Malloc(data_dim); + } + + out[file_data.size()]->need_diff = false; + memoryBytes += out[file_data.size()]->Malloc(labelCPU->dim); + checkCUDA(__LINE__, cudaMalloc(&labelGPU, labelCPU->numBytes()) ); + memoryBytes += labelCPU->numBytes(); + + for (int data_i = 0; data_i::prefetch,this); + + return memoryBytes; + }; +}; + + +class ConvolutionLayer : public Layer { + cudnnFilterDescriptor_t filter_desc; + cudnnTensorDescriptor_t bias_desc; + cudnnConvolutionDescriptor_t conv_desc; + + std::vector fwdAlgoWorkspaces; + std::vector bwdDataAlgoWorkspaces; + std::vector bwdFilterAlgoWorkspaces; + + std::vector fwdAlgoWorkspaceSizes; + std::vector bwdDataAlgoWorkspaceSizes; + std::vector bwdFilterAlgoWorkspaceSizes; +public: + cudnnConvolutionFwdAlgo_t fwdAlgo; + cudnnConvolutionBwdDataAlgo_t bwdDataAlgo; + cudnnConvolutionBwdFilterAlgo_t bwdFilterAlgo; + + int num_output; + std::vector window; + std::vector stride; + std::vector padding; + std::vector upscale; + int group; + + void init(){ + weight_dim.push_back(num_output); + weight_dim.push_back(0); // need the channel size from the input + weight_dim.insert( weight_dim.end(), window.begin(), window.end() ); + + bias_dim.resize(weight_dim.size(), 1); + bias_dim[1] = num_output; + }; + + ConvolutionLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetValue(json, train_me, true) + SetOrDie(json, num_output ) + SetOrDie(json, window ) + SetValue(json, weight_lr_mult, 1.0) + SetValue(json, weight_filler, Xavier) + SetValue(json, weight_filler_param, 0.0) + SetValue(json, bias_lr_mult, 2.0) + SetValue(json, bias_filler, Constant) + SetValue(json, bias_filler_param, 0.0) + SetValue(json, weight_decay_mult, 1.0) + SetValue(json, bias_decay_mult, 1.0) + SetValue(json, group, 1) + + std::vector ones = std::vector(window.size(),1); + std::vector zeros = std::vector(window.size(),0); + SetValue(json, padding, zeros) + SetValue(json, stride, ones) + SetValue(json, upscale, ones) + SetValue(json, fwdAlgo, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) + SetValue(json, bwdDataAlgo, CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) + SetValue(json, bwdFilterAlgo, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0) + + init(); + }; + + ConvolutionLayer(std::string name_, + int num_output_, + std::vector window_, + std::vector padding_, std::vector stride_, std::vector upscale_, + ComputeT weight_lr_mult_, Filler weight_filler_, ComputeT weight_filler_param_, + ComputeT bias_lr_mult_, Filler bias_filler_, ComputeT bias_filler_param_): + Layer(name_), + num_output(num_output_), window(window_), stride(stride_), padding(padding_), upscale(upscale_){ + + weight_lr_mult = weight_lr_mult_; + weight_filler = weight_filler_; + weight_filler_param = weight_filler_param_; + + bias_lr_mult = bias_lr_mult_; + bias_filler = bias_filler_; + bias_filler_param = bias_filler_param_; + + init(); + }; + + size_t Malloc(Phase phase_){ + size_t memoryBytes = 0; + train_me = train_me && phase_ != Testing; + + std::cout<< (train_me? 
"* " : " "); + std::cout<1) std::cout<<" ("<dim[1]/group; + + // create descriptor + checkCUDNN(__LINE__,cudnnCreateFilterDescriptor(&filter_desc) ); + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&bias_desc) ); + checkCUDNN(__LINE__,cudnnCreateConvolutionDescriptor(&conv_desc) ); + // set descriptor + // set the parameters for convolution + + std::vector weight_dim_group = weight_dim; + weight_dim_group[0] = weight_dim[0]/group; + + checkCUDNN(__LINE__,cudnnSetFilterNdDescriptor(filter_desc, + CUDNNStorageT, + CUDNN_TENSOR_NCHW, + weight_dim.size(), + &weight_dim_group[0]) ); + + checkCUDNN(__LINE__,cudnnSetConvolutionNdDescriptor(conv_desc, + padding.size(), + &padding[0], + &stride[0], + &upscale[0], + CUDNN_CROSS_CORRELATION, + CUDNNConvComputeT) ); + + std::vector bias_stride(bias_dim.size()); + + bias_stride[bias_dim.size()-1] = 1; + for (int d=bias_dim.size()-2;d>=0;--d){ + bias_stride[d] = bias_stride[d+1] * bias_dim[d+1]; + } + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(bias_desc, + CUDNNStorageT, + bias_dim.size(), + &bias_dim[0], + &bias_stride[0]) ); + + + weight_numel = numel(weight_dim); + bias_numel = numel(bias_dim); + + if (weight_numel>0){ + std::cout<<" weight"; veciPrint(weight_dim); + checkCUDA(__LINE__, cudaMalloc( &weight_dataGPU, weight_numel * sizeofStorageT) ); + memoryBytes += weight_numel * sizeofStorageT; + } + if (bias_numel>0){ + std::cout<<" bias"; veciPrint(bias_dim); + checkCUDA(__LINE__, cudaMalloc( &bias_dataGPU, bias_numel * sizeofStorageT) ); + memoryBytes += bias_numel * sizeofStorageT; + } + std::cout<need_diff = train_me || in[i]->need_diff; // if one of them need the grad + + std::vector dimOut; + dimOut.resize(in[i]->dim.size()); + + checkCUDNN(__LINE__,cudnnGetConvolutionNdForwardOutputDim(conv_desc, + in[i]->getDesc(group), + filter_desc, + dimOut.size(), + &dimOut[0] + )); + dimOut[1] *= group; + + size_t dall = in[i]->receptive_field.size(); + out[i]->receptive_field .resize(dall); + out[i]->receptive_gap .resize(dall); + out[i]->receptive_offset.resize(dall); + for(size_t d=0;dreceptive_field[d] = in[i]->receptive_field[d] + ComputeT(window[d]-1) * in[i]->receptive_gap[d]; + out[i]->receptive_gap[d] = stride[d] * in[i]->receptive_gap[d]; + out[i]->receptive_offset[d] = in[i]->receptive_offset[d] - ComputeT(padding[d]) * in[i]->receptive_gap[d]; + } + memoryBytes += out[i]->Malloc(dimOut); + + + } + + // Allocate workspace + fwdAlgoWorkspaces.resize(in.size()); + bwdDataAlgoWorkspaces.resize(out.size()); + bwdFilterAlgoWorkspaces.resize(out.size()); + + fwdAlgoWorkspaceSizes.resize(in.size()); + bwdDataAlgoWorkspaceSizes.resize(out.size()); + bwdFilterAlgoWorkspaceSizes.resize(out.size()); + + for (int i=0;igetDesc(group), + filter_desc, + conv_desc, + out[i]->getDesc(group), + fwdAlgo, + &fwdAlgoWorkspaceSizes[i])); + checkCUDA(__LINE__, cudaMalloc( &fwdAlgoWorkspaces[i], fwdAlgoWorkspaceSizes[i]) ); + } + + for (int i=0;igetDesc(group), + conv_desc, + in[i]->getDesc(group), + bwdDataAlgo, + &bwdDataAlgoWorkspaceSizes[i])); + + checkCUDNN(__LINE__,cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle, + in[i]->getDesc(group), + out[i]->getDesc(group), + conv_desc, + filter_desc, + bwdFilterAlgo, + &bwdFilterAlgoWorkspaceSizes[i])); + + checkCUDA(__LINE__, cudaMalloc( &bwdDataAlgoWorkspaces[i], bwdDataAlgoWorkspaceSizes[i]) ); + checkCUDA(__LINE__, cudaMalloc( &bwdFilterAlgoWorkspaces[i], bwdFilterAlgoWorkspaceSizes[i]) ); + } + + return memoryBytes; + }; + + void forward(Phase phase_){ + + for (int i=0;igetDesc(group), + 
in[i]->dataGPU + (g * in[i]->sizeofitem() / group), + filter_desc, + weight_dataGPU + (g * weight_numel / group), + conv_desc, + fwdAlgo, // CUDNN For 3-d convolutions, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is supported; support is provided for any format for srcDesc and destDesc as well as support for all data type configurations. + fwdAlgoWorkspaces[i], + fwdAlgoWorkspaceSizes[i], + zero, + out[i]->getDesc(group), + out[i]->dataGPU + (g * out[i]->sizeofitem() / group) ) ); + + } + + if (bias_dim.size()<=5){ + checkCUDNN(__LINE__,cudnnAddTensor(cudnnHandle, + one, + bias_desc, + bias_dataGPU, + one, + out[i]->desc, + out[i]->dataGPU) ); + }else{ + std::vector bias_dim_bug; + bias_dim_bug.push_back(bias_dim[0]); + bias_dim_bug.push_back(bias_dim[1]); + bias_dim_bug.push_back(bias_dim[2]); + bias_dim_bug.push_back(1); + for (int d=3;d bias_stride(bias_dim_bug.size()); + bias_stride[bias_dim_bug.size()-1] = 1; + for (int d=bias_dim_bug.size()-2;d>=0;--d){ + bias_stride[d] = bias_stride[d+1] * bias_dim_bug[d+1]; + } + cudnnTensorDescriptor_t bias_desc_bug; + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&bias_desc_bug) ); + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(bias_desc_bug, + CUDNNStorageT, + bias_dim_bug.size(), + &bias_dim_bug[0], + &bias_stride[0]) ); + std::vector out_dim_bug; + out_dim_bug.push_back(out[i]->dim[0]); + out_dim_bug.push_back(out[i]->dim[1]); + out_dim_bug.push_back(out[i]->dim[2]); + out_dim_bug.push_back(1); + for (int d=3;ddim.size();++d) out_dim_bug[3] *= out[i]->dim[d]; + std::vector strideA(out_dim_bug.size()); + strideA[out_dim_bug.size()-1] = 1; + for (int d=out_dim_bug.size()-2;d>=0;--d) strideA[d] = strideA[d+1] * out_dim_bug[d+1]; + cudnnTensorDescriptor_t out_desc_bug; + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&out_desc_bug)); + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(out_desc_bug, + CUDNNStorageT, + out_dim_bug.size(), + &out_dim_bug[0], + &strideA[0]) ); + checkCUDNN(__LINE__,cudnnAddTensor(cudnnHandle, + one, + bias_desc_bug, + bias_dataGPU, + one, + out_desc_bug, + out[i]->dataGPU) ); + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(bias_desc_bug) ); + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(out_desc_bug) ); + } + } + }; + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + for (int g = 0; g < group; g++) { + checkCUDNN(__LINE__,cudnnConvolutionBackwardData(cudnnHandle, + one, + filter_desc, weight_dataGPU + (g * weight_numel / group), + out[i]->getDesc(group), out[i]->diffGPU + (g * out[i]->sizeofitem() / group), + conv_desc, + bwdDataAlgo, bwdDataAlgoWorkspaces[i], bwdDataAlgoWorkspaceSizes[i], + one, + in[i]->getDesc(group), in[i]->diffGPU + (g * in[i]->sizeofitem() / group))); + } + } + } + // compute in->diff first because the next layer need to use it immediate, and because weight_diff needs to write to another GPU + for (int i=0;i0){ + for (int g = 0; g < group; g++) { + checkCUDNN(__LINE__,cudnnConvolutionBackwardFilter(cudnnHandle, + one, + in[i]->getDesc(group), in[i]->dataGPU + (g * in[i]->sizeofitem() / group), + out[i]->getDesc(group), out[i]->diffGPU + (g * out[i]->sizeofitem() / group), + conv_desc, + bwdFilterAlgo, bwdFilterAlgoWorkspaces[i], bwdFilterAlgoWorkspaceSizes[i], + &beta, + filter_desc, weight_diffGPU + (g * weight_numel / group))); + } + } + if (bias_numel>0){ + checkCUDNN(__LINE__,cudnnConvolutionBackwardBias(cudnnHandle, + one, + out[i]->desc, out[i]->diffGPU, + &beta, + bias_desc, bias_diffGPU)); + } + } + } + }; + ~ConvolutionLayer(){ + // destory the descriptor + 
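+        // i.e. the cuDNN filter, bias, and convolution descriptors created in Malloc()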
checkCUDNN(__LINE__,cudnnDestroyFilterDescriptor(filter_desc) ); + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(bias_desc) ); + checkCUDNN(__LINE__,cudnnDestroyConvolutionDescriptor(conv_desc) ); + + for (int i=0;i fwdAlgoWorkspaces; + std::vector bwdDataAlgoWorkspaces; + std::vector bwdFilterAlgoWorkspaces; + + std::vector fwdAlgoWorkspaceSizes; + std::vector bwdDataAlgoWorkspaceSizes; + std::vector bwdFilterAlgoWorkspaceSizes; +public: + cudnnConvolutionFwdAlgo_t fwdAlgo; + cudnnConvolutionBwdDataAlgo_t bwdDataAlgo; + cudnnConvolutionBwdFilterAlgo_t bwdFilterAlgo; + + int num_output; + std::vector window; + std::vector stride; + std::vector padding; + std::vector upscale; + int group; + + void init(){ + weight_dim.push_back(0); + weight_dim.push_back(0); // need the channel size from the input + weight_dim.insert( weight_dim.end(), window.begin(), window.end() ); + + bias_dim.resize(weight_dim.size(), 1); + bias_dim[1] = num_output; + }; + + DeconvolutionLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetValue(json, train_me, true) + SetOrDie(json, num_output ) + SetOrDie(json, window ) + SetValue(json, weight_lr_mult, 1.0) + SetValue(json, weight_filler, Xavier) + SetValue(json, weight_filler_param, 0.0) + SetValue(json, bias_lr_mult, 2.0) + SetValue(json, bias_filler, Constant) + SetValue(json, bias_filler_param, 0.0) + SetValue(json, weight_decay_mult, 1.0) + SetValue(json, bias_decay_mult, 1.0) + SetValue(json, group, 1) + + std::vector ones = std::vector(window.size(),1); + std::vector zeros = std::vector(window.size(),0); + SetValue(json, padding, zeros) + SetValue(json, stride, ones) + SetValue(json, upscale, ones) + SetValue(json, fwdAlgo, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) + SetValue(json, bwdDataAlgo, CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) + SetValue(json, bwdFilterAlgo, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0) + + init(); + }; + + DeconvolutionLayer(std::string name_, + int num_output_, + std::vector window_, + std::vector padding_, std::vector stride_, std::vector upscale_, + ComputeT weight_lr_mult_, Filler weight_filler_, ComputeT weight_filler_param_, + ComputeT bias_lr_mult_, Filler bias_filler_, ComputeT bias_filler_param_): + Layer(name_), + num_output(num_output_), window(window_), stride(stride_), padding(padding_), upscale(upscale_){ + + weight_lr_mult = weight_lr_mult_; + weight_filler = weight_filler_; + weight_filler_param = weight_filler_param_; + + bias_lr_mult = bias_lr_mult_; + bias_filler = bias_filler_; + bias_filler_param = bias_filler_param_; + + init(); + }; + + size_t Malloc(Phase phase_){ + size_t memoryBytes = 0; + train_me = train_me && phase_ != Testing; + + std::cout<< (train_me? 
"* " : " "); + std::cout<1) std::cout<<" ("<dim[1]; + weight_dim[1] = num_output/group; + + // create descriptor + checkCUDNN(__LINE__,cudnnCreateFilterDescriptor(&filter_desc) ); + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&bias_desc) ); + checkCUDNN(__LINE__,cudnnCreateConvolutionDescriptor(&conv_desc) ); + // set descriptor + // set the parameters for convolution + + std::vector weight_dim_group = weight_dim; + weight_dim_group[0] = weight_dim[0]/group; + + checkCUDNN(__LINE__,cudnnSetFilterNdDescriptor(filter_desc, + CUDNNStorageT, + CUDNN_TENSOR_NCHW, + weight_dim.size(), + &weight_dim_group[0]) ); + + checkCUDNN(__LINE__,cudnnSetConvolutionNdDescriptor(conv_desc, + padding.size(), + &padding[0], + &stride[0], + &upscale[0], + CUDNN_CROSS_CORRELATION, + CUDNNConvComputeT) ); + + std::vector bias_stride(bias_dim.size()); + + bias_stride[bias_dim.size()-1] = 1; + for (int d=bias_dim.size()-2;d>=0;--d){ + bias_stride[d] = bias_stride[d+1] * bias_dim[d+1]; + } + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(bias_desc, + CUDNNStorageT, + bias_dim.size(), + &bias_dim[0], + &bias_stride[0]) ); + + weight_numel = numel(weight_dim); + bias_numel = numel(bias_dim); + + if (weight_numel>0){ + std::cout<<" weight"; veciPrint(weight_dim); + checkCUDA(__LINE__, cudaMalloc( &weight_dataGPU, weight_numel * sizeofStorageT) ); + memoryBytes += weight_numel * sizeofStorageT; + } + if (bias_numel>0){ + std::cout<<" bias"; veciPrint(bias_dim); + checkCUDA(__LINE__, cudaMalloc( &bias_dataGPU, bias_numel * sizeofStorageT) ); + memoryBytes += bias_numel * sizeofStorageT; + } + std::cout<need_diff = train_me || in[i]->need_diff; // if one of them need the grad + + std::vector dimOut; + dimOut.resize(in[i]->dim.size()); + + dimOut[0] = in[i]->dim[0]; + dimOut[1] = num_output; + for (int d=0;ddim[2+d]-1)*stride[d] + window[d] - 2*padding[d]; + } + + size_t dall = in[i]->receptive_field.size(); + out[i]->receptive_field .resize(dall); + out[i]->receptive_gap .resize(dall); + out[i]->receptive_offset.resize(dall); + for(size_t d=0;dreceptive_gap[d] = in[i]->receptive_gap[d] / stride[d]; + out[i]->receptive_field[d] = in[i]->receptive_field[d] - ComputeT(window[d]-1) * in[i]->receptive_gap[d]; + out[i]->receptive_offset[d] = in[i]->receptive_offset[d] + ComputeT(padding[d]) * in[i]->receptive_gap[d]; + } + memoryBytes += out[i]->Malloc(dimOut); + } + + // Allocate workspace + fwdAlgoWorkspaces.resize(in.size()); + bwdDataAlgoWorkspaces.resize(out.size()); + bwdFilterAlgoWorkspaces.resize(out.size()); + + fwdAlgoWorkspaceSizes.resize(in.size()); + bwdDataAlgoWorkspaceSizes.resize(out.size()); + bwdFilterAlgoWorkspaceSizes.resize(out.size()); + + for (int i=0;igetDesc(group), + filter_desc, + conv_desc, + in[i]->getDesc(group), + fwdAlgo, + &fwdAlgoWorkspaceSizes[i])); + checkCUDA(__LINE__, cudaMalloc( &fwdAlgoWorkspaces[i], fwdAlgoWorkspaceSizes[i]) ); + } + + for (int i=0;igetDesc(group), + conv_desc, + out[i]->getDesc(group), + bwdDataAlgo, + &bwdDataAlgoWorkspaceSizes[i])); + + checkCUDNN(__LINE__,cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle, + out[i]->getDesc(group), + in[i]->getDesc(group), + conv_desc, + filter_desc, + bwdFilterAlgo, + &bwdFilterAlgoWorkspaceSizes[i])); + + checkCUDA(__LINE__, cudaMalloc( &bwdDataAlgoWorkspaces[i], bwdDataAlgoWorkspaceSizes[i]) ); + checkCUDA(__LINE__, cudaMalloc( &bwdFilterAlgoWorkspaces[i], bwdFilterAlgoWorkspaceSizes[i]) ); + } + + return memoryBytes; + }; + + void forward(Phase phase_){ + + for (int i=0;igetDesc(group), + in[i]->dataGPU + (g * 
in[i]->sizeofitem() / group), + conv_desc, + bwdDataAlgo, bwdDataAlgoWorkspaces[i], bwdDataAlgoWorkspaceSizes[i], + zero, + out[i]->getDesc(group), + out[i]->dataGPU + (g * out[i]->sizeofitem() / group))); + } + + if (bias_dim.size()<=5){ + checkCUDNN(__LINE__,cudnnAddTensor(cudnnHandle, + one, + bias_desc, + bias_dataGPU, + one, + out[i]->desc, + out[i]->dataGPU) ); + }else{ + std::vector bias_dim_bug; + bias_dim_bug.push_back(bias_dim[0]); + bias_dim_bug.push_back(bias_dim[1]); + bias_dim_bug.push_back(bias_dim[2]); + bias_dim_bug.push_back(1); + for (int d=3;d bias_stride(bias_dim_bug.size()); + bias_stride[bias_dim_bug.size()-1] = 1; + for (int d=bias_dim_bug.size()-2;d>=0;--d){ + bias_stride[d] = bias_stride[d+1] * bias_dim_bug[d+1]; + } + cudnnTensorDescriptor_t bias_desc_bug; + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&bias_desc_bug) ); + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(bias_desc_bug, + CUDNNStorageT, + bias_dim_bug.size(), + &bias_dim_bug[0], + &bias_stride[0]) ); + std::vector out_dim_bug; + out_dim_bug.push_back(out[i]->dim[0]); + out_dim_bug.push_back(out[i]->dim[1]); + out_dim_bug.push_back(out[i]->dim[2]); + out_dim_bug.push_back(1); + for (int d=3;ddim.size();++d) out_dim_bug[3] *= out[i]->dim[d]; + std::vector strideA(out_dim_bug.size()); + strideA[out_dim_bug.size()-1] = 1; + for (int d=out_dim_bug.size()-2;d>=0;--d) strideA[d] = strideA[d+1] * out_dim_bug[d+1]; + cudnnTensorDescriptor_t out_desc_bug; + checkCUDNN(__LINE__,cudnnCreateTensorDescriptor(&out_desc_bug)); + checkCUDNN(__LINE__,cudnnSetTensorNdDescriptor(out_desc_bug, + CUDNNStorageT, + out_dim_bug.size(), + &out_dim_bug[0], + &strideA[0]) ); + checkCUDNN(__LINE__,cudnnAddTensor(cudnnHandle, + one, + bias_desc_bug, + bias_dataGPU, + one, + out_desc_bug, + out[i]->dataGPU) ); + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(bias_desc_bug) ); + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(out_desc_bug) ); + } + } + }; + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + for (int g = 0; g < group; g++) { + checkCUDNN(__LINE__,cudnnConvolutionForward(cudnnHandle, + one, + out[i]->getDesc(group), + out[i]->diffGPU + (g * out[i]->sizeofitem() / group), + filter_desc, + weight_dataGPU + (g * weight_numel / group), + conv_desc, + fwdAlgo, // CUDNN For 3-d convolutions, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is supported; support is provided for any format for srcDesc and destDesc as well as support for all data type configurations. 
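+                    // Backward w.r.t. the deconvolution input is a plain forward convolution of
+                    // out[i]->diffGPU with the same filters, so the forward algorithm and its
+                    // workspace (chosen in Malloc) are reused here, accumulating into in[i]->diffGPU.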
+ fwdAlgoWorkspaces[i], + fwdAlgoWorkspaceSizes[i], + one, + in[i]->getDesc(group), + in[i]->diffGPU + (g * in[i]->sizeofitem() / group))); + } + } + } + // compute in->diff first because the next layer need to use it immediate, and because weight_diff needs to write to another GPU + for (int i=0;i0){ + for (int g = 0; g < group; g++) { + checkCUDNN(__LINE__,cudnnConvolutionBackwardFilter(cudnnHandle, + one, + out[i]->getDesc(group), out[i]->diffGPU + (g * out[i]->sizeofitem() / group), + in[i]->getDesc(group), in[i]->dataGPU + (g * in[i]->sizeofitem() / group), + conv_desc, + bwdFilterAlgo, bwdFilterAlgoWorkspaces[i], bwdFilterAlgoWorkspaceSizes[i], + &beta, + filter_desc, weight_diffGPU + (g * weight_numel / group))); + } + } + if (bias_numel>0){ + checkCUDNN(__LINE__,cudnnConvolutionBackwardBias(cudnnHandle, + one, + out[i]->desc, out[i]->diffGPU, + &beta, + bias_desc, bias_diffGPU)); + } + } + } + }; + ~DeconvolutionLayer(){ + // destory the descriptor + checkCUDNN(__LINE__,cudnnDestroyFilterDescriptor(filter_desc) ); + checkCUDNN(__LINE__,cudnnDestroyTensorDescriptor(bias_desc) ); + checkCUDNN(__LINE__,cudnnDestroyConvolutionDescriptor(conv_desc) ); + + for (int i=0;idim); + num_items = in[0]->dim[0]; + + weight_dim.resize(2); + weight_dim[0] = num_output; + weight_dim[1] = num_input; + weight_numel = numel(weight_dim); + + if (bias_term){ + bias_dim.resize(1); + bias_dim[0] = num_output; + bias_numel = numel(bias_dim); + }else{ + bias_numel = 0; + } + + + if (weight_numel>0){ + std::cout<<" weight"; veciPrint(weight_dim); + checkCUDA(__LINE__, cudaMalloc(&weight_dataGPU, weight_numel * sizeofStorageT) ); + memoryBytes += weight_numel * sizeofStorageT; + } + + if (bias_numel>0){ + std::cout<<" bias"; veciPrint(bias_dim); + checkCUDA(__LINE__, cudaMalloc(&bias_dataGPU, bias_numel * sizeofStorageT) ); + memoryBytes += bias_numel * sizeofStorageT; + checkCUDA(__LINE__, cudaMalloc(&bias_multGPU, num_items * sizeofStorageT) ); + Kernel_set_value<<>>(CUDA_GET_LOOPS(num_items), num_items, bias_multGPU, CPUCompute2StorageT(1)); + memoryBytes += num_items * sizeofStorageT; + } + std::cout<need_diff = train_me || in[i]->need_diff; // if one of them need the grad + std::vector dimOut(in[i]->dim.size()); + dimOut[0] = in[i]->dim[0]; + dimOut[1] = num_output; + for (int d=2;ddim.size();++d) + dimOut[d] = 1; + + size_t dall = in[i]->receptive_field.size(); + out[i]->receptive_field .resize(dall); + out[i]->receptive_gap .resize(dall); + out[i]->receptive_offset.resize(dall); + + for(size_t d=0;dreceptive_field[d] = in[i]->receptive_field[d] + ComputeT(in[i]->dim[d+2]-1) * in[i]->receptive_gap[d]; + out[i]->receptive_gap[d] = 0; + out[i]->receptive_offset[d] = 0; + + } + + memoryBytes += out[i]->Malloc(dimOut); + + } + return memoryBytes; + }; + + void forward(Phase phase_){ + for (int i=0;idataGPU, num_input, zeroComputeT, out[i]->dataGPU, num_output) ); + if (bias_numel>0) + checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, num_output, num_items, 1, oneComputeT, bias_dataGPU, num_output, bias_multGPU, 1, oneComputeT, out[i]->dataGPU, num_output) ); + } + }; + + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, num_input, num_items, num_output, oneComputeT, weight_dataGPU, num_input, out[i]->diffGPU, num_output, oneComputeT, in[i]->diffGPU, num_input) ); + } + } + + for (int i=0;i0){ + checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, num_input, num_output, num_items, oneComputeT, 
in[i]->dataGPU, num_input, out[i]->diffGPU, num_output, &beta, weight_diffGPU, num_input) ); + } + if (bias_numel>0){ + checkCUBLAS(__LINE__, GPUgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, num_output, 1, num_items, oneComputeT, out[i]->diffGPU, num_output, bias_multGPU, num_items, &beta, bias_diffGPU, num_output) ); + } + } + } + }; + + ~InnerProductLayer(){ + if (bias_multGPU!=NULL) checkCUDA(__LINE__, cudaFree(bias_multGPU)); + }; +}; + +class DropoutLayer: public Layer{ + std::vector dropoutDescs; + std::vector states; + std::vector reserveSpaces; + std::vector stateSizes; + std::vector reserveSpaceSizes; + + std::vector SIZEmask; +public: + ComputeT dropout_rate; + void init() { + // This function is empty for now + }; + DropoutLayer(std::string name_, ComputeT dropout_rate_): Layer(name_), dropout_rate(dropout_rate_){ + init(); + }; + DropoutLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetValue(json, dropout_rate, 0.5) + init(); + }; + size_t Malloc(Phase phase_){ + dropoutDescs.resize(in.size()); + states.resize(in.size()); + reserveSpaces.resize(in.size()); + stateSizes.resize(in.size()); + reserveSpaceSizes.resize(in.size()); + + for (int i=0;idim); + + out[i]->need_diff = in[i]->need_diff; + out[i]->receptive_field = in[i]->receptive_field; + out[i]->receptive_gap = in[i]->receptive_gap; + out[i]->receptive_offset = in[i]->receptive_offset; + memoryBytes += out[i]->Malloc(in[i]->dim); + } + + std::random_device rd; + + for (int i=0;igetDesc(), &reserveSpaceSizes[i])); + checkCUDA(__LINE__,cudaMalloc(&reserveSpaces[i], reserveSpaceSizes[i])); + memoryBytes += reserveSpaceSizes[i]; + checkCUDNN(__LINE__,cudnnSetDropoutDescriptor(dropoutDescs[i], + cudnnHandle, + dropout_rate, + states[i], + stateSizes[i], + rd())); + } + + return memoryBytes; + }; + ~DropoutLayer(){ + for (int i=0;igetDesc(), + in[i]->dataGPU, + out[i]->getDesc(), + out[i]->dataGPU, + reserveSpaces[i], + reserveSpaceSizes[i] + )); + } + }else{ + for (int i=0;idataGPU, in[i]->dataGPU, sizeofStorageT*SIZEmask[i], cudaMemcpyDeviceToDevice)); + } + } + } + }; + void backward(Phase phase_){ + if ( phase_==Training ){ + for (int i=0;igetDesc(), + out[i]->diffGPU, + in[i]->getDesc(), + in[i]->diffGPU, + reserveSpaces[i], + reserveSpaceSizes[i] + )); + } + }else{ + std::cerr<<"there should be no backward for testing"<diffGPU, out[i]->diffGPU, sizeofStorageT*SIZEmask[i], cudaMemcpyDeviceToDevice)); + } + } + } + }; +}; + +class SoftmaxLayer : public Layer { +public: + bool stable_gradient; + + SoftmaxLayer(std::string name_): Layer(name_), stable_gradient(true){}; + + SoftmaxLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetValue(json, stable_gradient, true) + }; + + size_t Malloc(Phase phase_){ + size_t memoryBytes = 0; + std::cout<< (train_me? 
"* " : " "); + std::cout<need_diff = in[i]->need_diff; + out[i]->receptive_field = in[i]->receptive_field; + out[i]->receptive_gap = in[i]->receptive_gap; + out[i]->receptive_offset = in[i]->receptive_offset; + memoryBytes += out[i]->Malloc(in[i]->dim); + } + return memoryBytes; + }; + void forward(Phase phase_){ + for (int i=0;idesc, in[i]->dataGPU, + zero, + out[i]->desc, out[i]->dataGPU)); + } + }; + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + if (stable_gradient){ + if (in[i]->diffGPU != out[i]->diffGPU){ + xpy(numel(in[i]->dim), out[i]->diffGPU, in[i]->diffGPU); + } + }else{ + checkCUDNN(__LINE__,cudnnSoftmaxBackward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, //CUDNN_SOFTMAX_MODE_CHANNEL, + one, + out[i]->desc, out[i]->dataGPU, out[i]->desc, out[i]->diffGPU, + zero, //one, //bbb + in[i]->desc, in[i]->diffGPU)); + } + } + + + } + }; +}; + +class ActivationLayer : public Layer { + cudnnActivationDescriptor_t activationDesc; +public: + cudnnActivationMode_t mode; + + ActivationLayer(std::string name_, cudnnActivationMode_t mode_): Layer(name_), mode(mode_) {}; + + ActivationLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, mode, CUDNN_ACTIVATION_RELU) + SetValue(json, phase, TrainingTesting) + }; + + ~ActivationLayer() { + checkCUDNN(__LINE__,cudnnDestroyActivationDescriptor(activationDesc)); + } + + size_t Malloc(Phase phase_){ + size_t memoryBytes = 0; + std::cout<< (train_me? "* " : " "); + std::cout<need_diff = in[i]->need_diff; + out[i]->receptive_field = in[i]->receptive_field; + out[i]->receptive_gap = in[i]->receptive_gap; + out[i]->receptive_offset = in[i]->receptive_offset; + memoryBytes += out[i]->Malloc(in[i]->dim); + } + return memoryBytes; + }; + void forward(Phase phase_){ + for (int i=0;idesc, in[i]->dataGPU, + zero, + out[i]->desc, out[i]->dataGPU)); + } + }; + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + checkCUDNN(__LINE__,cudnnActivationBackward(cudnnHandle, + activationDesc, + one, + out[i]->desc, out[i]->dataGPU, out[i]->desc, out[i]->diffGPU, + in[i]->desc, in[i]->dataGPU, + zero, //one, //bbb + in[i]->desc, in[i]->diffGPU)); + } + } + }; +}; + +class PoolingLayer : public Layer { + cudnnPoolingDescriptor_t desc; +public: + cudnnPoolingMode_t mode; + std::vector window; + std::vector padding; + std::vector stride; + + void init(){ + checkCUDNN(__LINE__,cudnnCreatePoolingDescriptor(&desc) ); + checkCUDNN(__LINE__,cudnnSetPoolingNdDescriptor(desc, + mode, + CUDNN_PROPAGATE_NAN, + window.size(), + &window[0], + &padding[0], + &stride[0])); + }; + + PoolingLayer(std::string name_, cudnnPoolingMode_t mode_, std::vector window_, std::vector padding_, std::vector stride_): Layer(name_), mode(mode_), window(window_), padding(padding_), stride(stride_){ + init(); + }; + + PoolingLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetValue(json, mode, CUDNN_POOLING_MAX) + SetOrDie(json, window ) + std::vector zeros = std::vector(window.size(),0); + SetValue(json, padding, zeros) + SetValue(json, stride, window) + + init(); + }; + + size_t Malloc(Phase phase_){ + size_t memoryBytes=0; + std::cout<< (train_me? 
"* " : " "); + std::cout<need_diff = in[i]->need_diff; + + // compute the size to allocate memory + std::vector dimOut(in[i]->dim.size()); + dimOut[0] = in[i]->dim[0]; // size of mini-bath + dimOut[1] = in[i]->dim[1]; // channels + for (int d=2;ddim.size();++d){ + dimOut[d] = 1 + static_cast(ceil(static_cast(in[i]->dim[d] + 2*padding[d-2] - window[d-2])/stride[d-2])); + } + + size_t dall = in[i]->receptive_field.size(); + out[i]->receptive_field .resize(dall); + out[i]->receptive_gap .resize(dall); + out[i]->receptive_offset.resize(dall); + for(size_t d=0;dreceptive_field[d] = in[i]->receptive_field[d] + ComputeT(window[d]-1) * in[i]->receptive_gap[d]; + out[i]->receptive_gap[d] = stride[d] * in[i]->receptive_gap[d]; + out[i]->receptive_offset[d] = in[i]->receptive_offset[d] - ComputeT(padding[d]) * in[i]->receptive_gap[d]; + } + + memoryBytes += out[i]->Malloc(dimOut); + } + return memoryBytes; + }; + void forward(Phase phase_){ + for (int i=0;idesc, in[i]->dataGPU, + zero, + out[i]->desc, out[i]->dataGPU)); + + } + }; + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + checkCUDNN(__LINE__,cudnnPoolingBackward(cudnnHandle, + desc, + one, + out[i]->desc, out[i]->dataGPU, out[i]->desc, out[i]->diffGPU, + in[i]->desc, in[i]->dataGPU, + one, //zero, //one, //bbb + in[i]->desc, in[i]->diffGPU)); + } + } + }; + ~PoolingLayer(){ + checkCUDNN(__LINE__,cudnnDestroyPoolingDescriptor(desc) ); + }; +}; + + +class LRNLayer : public Layer { + cudnnLRNDescriptor_t desc; +public: + LRN mode; + unsigned int local_size; + ComputeT alpha; + ComputeT beta; + ComputeT k; + + void init(){ + if (local_sizeCUDNN_LRN_MAX_N){ std::cout<<"LRN local_size out of range ["<< CUDNN_LRN_MIN_N <<","<< CUDNN_LRN_MAX_N <<"]: local_size="<need_diff = in[i]->need_diff; + out[i]->receptive_field = in[i]->receptive_field; + out[i]->receptive_gap = in[i]->receptive_gap; + out[i]->receptive_offset = in[i]->receptive_offset; + memoryBytes += out[i]->Malloc(in[i]->dim); + } + return memoryBytes; + }; + + void forward(Phase phase_){ + for (int i=0;idesc, in[i]->dataGPU, + zero, + out[i]->desc, out[i]->dataGPU)); + break; + case DivisiveNormalization: +#ifdef CUDNN_DivisiveNormalization + // What is the Best Multi-Stage Architecture for Object Recognition? 
+ // http://yann.lecun.com/exdb/publis/pdf/jarrett-iccv-09.pdf + std::cout<<"Not implemented yet"<desc, in[i]->dataGPU, + srcMeansData, tempData, tempData2, + zero, + out[i]->desc, out[i]->dataGPU)); +#endif + break; + } + } + }; + + void backward(Phase phase_){ + for (int i=0;ineed_diff){ + switch(mode){ + case CrossChannel: + checkCUDNN(__LINE__,cudnnLRNCrossChannelBackward(cudnnHandle, desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, + one, + out[i]->desc /*srcDesc*/, out[i]->dataGPU /*srcData*/, + out[i]->desc /*srcDiffDesc*/, out[i]->diffGPU /*srcDiffData*/, + in[i]->desc /*destDesc*/, in[i]->dataGPU /*destData*/, + zero, //one, //bbb + in[i]->desc /*destDiffDesc*/, in[i]->diffGPU /*destDiffData*/)); + break; + case DivisiveNormalization: +#ifdef CUDNN_DivisiveNormalization + std::cout<<"Not implemented yet"<desc /*srcDesc*/, out[i]->dataGPU /*srcData*/, srcMeansData /*srcMeansData*/, + out[i]->diffGPU /*srcDiffData*/, + tempData /*tempData*/, tempData2 /*tempData2*/, + zero, //one, //bbb + in[i]->desc /*destDataDesc*/, in[i]->diffGPU /*destDataDiff*/, + destMeansDiff /*destMeansDiff*/)); +#endif + break; + } + } + } + }; +}; + + +class ReshapeLayer: public Layer { +public: + std::vector shape; + ReshapeLayer(std::string name_, Phase phase_): Layer(name_){ + phase = phase_; + }; + ReshapeLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, shape) + bool remainExist = false; + for(int d=0;dneed_diff = in[i]->need_diff; + std::vector dim; + for(int d=0;ddim[d]); + }else if (shape[d]==-1){ + dim.push_back(-1); + }else{ + dim.push_back(shape[d]); + } + } + int remain = numel(in[i]->dim)/numel(dim); + if (remain!=1){ + remain = -remain; + for(int d=0;dreceptive_field = in[i]->receptive_field; + out[i]->receptive_gap = in[i]->receptive_gap; + out[i]->receptive_offset = in[i]->receptive_offset; + memoryBytes += out[i]->Malloc(dim); + } + return memoryBytes; + }; + + void forward(Phase phase_){ + for (int i=0;idataGPU, in[i]->dataGPU, in[i]->numBytes(), cudaMemcpyDeviceToDevice)); + } + }; + void backward(Phase phase_){ + for(int i=0;ineed_diff){ + size_t N = numel(in[i]->dim); + Kernel_elementwise_acc<<>>(CUDA_GET_LOOPS(N), N, in[i]->diffGPU, out[i]->diffGPU); + } + } + }; +}; + +class ROILayer: public Layer { +public: + std::vector shape; + ROILayer(std::string name_, Phase phase_): Layer(name_){ + phase = phase_; + }; + ROILayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, shape) + }; + size_t Malloc(Phase phase_){ + size_t memoryBytes = 0; + std::cout<< (train_me? "* " : " "); + std::cout<dim.size() != shape.size()+1) { std::cout<need_diff = in[i*2]->need_diff; + + if (! 
(in[i*2+1]->dim[0] == in[i*2]->dim[0] && sizeofitem(in[i*2+1]->dim) == shape.size())){ + std::cout<dim is wrong"< dim; + dim.push_back(in[i*2]->dim[0]); + + for(int d=0;ddim[d+1]); + }else{ + dim.push_back(shape[d]); + } + } + + out[i]->receptive_field = in[i*2]->receptive_field; + out[i]->receptive_gap = in[i*2]->receptive_gap; + out[i]->receptive_offset = in[i*2]->receptive_offset; + memoryBytes += out[i]->Malloc(dim); + } + return memoryBytes; + }; + + void forward(Phase phase_){ + for (int i=0;idim); + switch(shape.size()){ + case 3: + Kernel_ROIforward_2D<<>>(CUDA_GET_LOOPS(N), N, out[i]->dataGPU, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3]); + break; + case 4: + Kernel_ROIforward_3D<<>>(CUDA_GET_LOOPS(N), N, out[i]->dataGPU, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4]); + break; + case 5: + Kernel_ROIforward_4D<<>>(CUDA_GET_LOOPS(N), N, out[i]->dataGPU, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], out[i]->dim[5], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], in[i*2]->dim[5]); + break; + default: + std::cerr<<"Haven't implemented yet"<need_diff){ + size_t N = numel(out[i]->dim); + switch(shape.size()){ + case 3: + Kernel_ROIbackward_2D<<>>(CUDA_GET_LOOPS(N), N, out[i]->diffGPU, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3]); + break; + case 4: + Kernel_ROIbackward_3D<<>>(CUDA_GET_LOOPS(N), N, out[i]->diffGPU, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4]); + break; + case 5: + Kernel_ROIbackward_4D<<>>(CUDA_GET_LOOPS(N), N, out[i]->diffGPU, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->dim[1], out[i]->dim[2], out[i]->dim[3], out[i]->dim[4], out[i]->dim[5], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], in[i*2]->dim[5]); + break; + default: + std::cerr<<"Haven't implemented yet"< GPUIndex; +public: + ComputeT spatial_scale; + std::vector shape; + ROIPoolingLayer(std::string name_, Phase phase_): Layer(name_){ + phase = phase_; + }; + ROIPoolingLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, shape) + SetOrDie(json, spatial_scale) + }; + ~ROIPoolingLayer(){ + for (int i=0;idim.size() != shape.size()+2) { std::cout<need_diff = in[i*2]->need_diff; + + if ( sizeofitem(in[i*2+1]->dim) != 1 + 2 * shape.size() ){ + std::cout<dim is wrong"< dim; + dim.push_back(in[i*2+1]->dim[0]); // number of boxes + dim.push_back(in[i*2]->dim[1]); // number of channels from convolutions + for(int d=0;dMalloc(dim); + + if (in[i*2]->need_diff){ + checkCUDA(__LINE__, cudaMalloc(&GPUIndex[i], numel(out[i]->dim) * sizeof(size_t)) ); + memoryBytes += numel(out[i]->dim) * sizeof(size_t); + } + } + return memoryBytes; + }; + + void forward(Phase phase_){ + for (int i=0;idim); + switch(shape.size()){ + case 2: + Kernel_ROIPoolForward_2D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dataGPU, GPUIndex[i], spatial_scale, in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], shape[0], shape[1]); + break; + case 3: + Kernel_ROIPoolForward_3D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->dataGPU, in[i*2+1]->dataGPU, out[i]->dataGPU, 
GPUIndex[i], spatial_scale, in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], shape[0], shape[1], shape[2]); + break; + default: + std::cerr<<"Haven't implemented yet"<need_diff){ + size_t N = numel(in[i*2]->dim); + switch(shape.size()){ + case 2: + Kernel_ROIPoolBackward_2D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->diffGPU, GPUIndex[i], spatial_scale, in[i*2+1]->dim[0], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], shape[0], shape[1]); + break; + case 3: + Kernel_ROIPoolBackward_3D<<>>(CUDA_GET_LOOPS(N), N, in[i*2]->diffGPU, in[i*2+1]->dataGPU, out[i]->diffGPU, GPUIndex[i], spatial_scale, in[i*2+1]->dim[0], in[i*2]->dim[1], in[i*2]->dim[2], in[i*2]->dim[3], in[i*2]->dim[4], shape[0], shape[1], shape[2]); + break; + default: + std::cerr<<"Haven't implemented yet"< coeff; + + ElementWiseLayer(std::string name_, ElementWiseOp mode_, bool last_in_is_coeff_=false): Layer(name_), mode(mode_), last_in_is_coeff(last_in_is_coeff_){ + }; + ElementWiseLayer(JSON* json){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, mode) + SetValue(json, last_in_is_coeff, false) + SetValue(json, coeff, std::vector()) + }; + size_t Malloc(Phase phase_){ + size_t memoryBytes = 0; + std::cout<< (train_me? "* " : " "); + std::cout<(last_in_is_coeff? in_group-1: in_group,1); + + for(int j=0;jneed_diff = false; + for(int i=j*in_group; i<(j+1)*in_group;i++){ + if (in[i]->need_diff){ + out[j]->need_diff = true; + break; + } + } + + out[j]->receptive_field = in[j*in_group]->receptive_field; + out[j]->receptive_gap = in[j*in_group]->receptive_gap; + out[j]->receptive_offset = in[j*in_group]->receptive_offset; + for(int i=j*in_group+1; i<(j+1)*in_group;i++){ + for(size_t d=0; dreceptive_field.size();++d){ + out[j]->receptive_field [d] = max(out[j]->receptive_field [d],in[i]->receptive_field [d]); + out[j]->receptive_gap [d] = max(out[j]->receptive_gap [d],in[i]->receptive_gap [d]); + out[j]->receptive_offset [d] = max(out[j]->receptive_offset [d],in[i]->receptive_offset [d]); + } + } + + memoryBytes += out[j]->Malloc(in[j*in_group]->dim); + } + return memoryBytes; + }; + void forward(Phase phase_){ + switch(mode){ + case ElementWise_EQL: + for (int i=0;idim); + GPU_set_ones(n, out[i]->dataGPU); + for (int j=i*in_group+1; j<(i+1)*in_group; ++j){ + GPU_elementwise_comparison(n, out[i]->dataGPU, in[i*in_group]->dataGPU, in[j]->dataGPU); + } + } + break; + case ElementWise_MUL: std::cout<<"Not implemented yet"<dataGPU : NULL; + size_t N = numel(in[i]->dim); + size_t items = in[i]->dim[0]; + size_t dim = in[i]->sizeofitem(); + CoeffElementWiseSumReplace<<>>(CUDA_GET_LOOPS(N), N, coeff[0], coeff_data, 0 * items, dim, in[i]->dataGPU, out[j]->dataGPU); + for (i=i+1; i<(j+1)*in_group - last_in_is_coeff; ++i){ + size_t ii = i-j*in_group; + CoeffElementWiseSumAccumulate<<>>(CUDA_GET_LOOPS(N), N, coeff[ii], coeff_data, ii * items, dim, in[i]->dataGPU, out[j]->dataGPU); + } + } + break; + case ElementWise_MAX: std::cout<<"Not implemented yet"<dataGPU : NULL; + size_t N = numel(in[i]->dim); + size_t items = in[i]->dim[0]; + size_t dim = in[i]->sizeofitem(); + for (; i<(j+1)*in_group - last_in_is_coeff; ++i){ + size_t ii = i-j*in_group; + CoeffElementWiseSumAccumulate<<>>(CUDA_GET_LOOPS(N), N, coeff[ii], coeff_data, ii * items, dim, out[j]->diffGPU, in[i]->diffGPU); + } + } + break; + case ElementWise_MAX: std::cout<<"Not implemented yet"<need_diff = false; + for(int i=j*in_group; i<(j+1)*in_group;i++){ + if (in[i]->need_diff){ + out[j]->need_diff = true; 
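+                    // one input that needs gradients is enough; the output must then propagate them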
+ break; + } + } + std::vector dim = in[j*in_group]->dim; + for(int i=j*in_group+1; i<(j+1)*in_group;i++){ + dim[1] += in[i]->dim[1]; + } + + out[j]->receptive_field = in[j*in_group]->receptive_field; + out[j]->receptive_gap = in[j*in_group]->receptive_gap; + out[j]->receptive_offset = in[j*in_group]->receptive_offset; + for(int i=j*in_group+1; i<(j+1)*in_group;i++){ + for(size_t d=0; dreceptive_field.size();++d){ + out[j]->receptive_field[d] = max(out[j]->receptive_field [d],in[i]->receptive_field [d]); + out[j]->receptive_gap [d] = max(out[j]->receptive_gap [d],in[i]->receptive_gap [d]); + out[j]->receptive_offset[d] = min(out[j]->receptive_offset[d],in[i]->receptive_offset[d]); + } + } + + memoryBytes += out[j]->Malloc(dim); + } + return memoryBytes; + }; + void forward(Phase phase_){ + for(int j=0;jdim[0]; + for(int i=j*in_group; i<(j+1)*in_group;i++){ + copyGPUforward (numofitems, in[i]->dataGPU, out[j]->dataGPU, sizeofitem(in[i]->dim), sizeofitem(out[j]->dim), offset); + offset += sizeofitem(in[i]->dim); + } + } + }; + void backward(Phase phase_){ + for(int j=0;jdim[0]; + for(int i=j*in_group; i<(j+1)*in_group;i++){ + if (in[i]->need_diff){ + copyGPUbackward(numofitems, in[i]->diffGPU, out[j]->diffGPU, sizeofitem(in[i]->dim), sizeofitem(out[j]->dim), offset); + } + offset += sizeofitem(in[i]->dim); + } + } + }; +}; + + +class LossLayer : public Layer { + StorageT* loss_values; + StorageT* loss_weightsGPU; + size_t loss_numel; + int numExamples; + ComputeT scale; +public: + ComputeT result; + ComputeT loss; + + + LossObjective mode; + ComputeT loss_weight; + std::vector loss_weights; + ComputeT margin; + + LossLayer(std::string name_, LossObjective mode_, ComputeT loss_weight_) + : Layer(name_), mode(mode_), loss_weight(loss_weight_), + loss_values(NULL), loss_weightsGPU(NULL) { + train_me = false; + }; + + LossLayer(JSON* json): loss_values(NULL), loss_weightsGPU(NULL){ + SetOrDie(json, name) + SetValue(json, phase, TrainingTesting) + SetOrDie(json, mode) + SetValue(json, loss_weight, 1) + SetValue(json, margin, 1) + + SetValue(json, loss_weights, std::vector()) + train_me = false; + }; + + ~LossLayer() { + if (loss_values != NULL) + checkCUDA(__LINE__, cudaFree(loss_values)); + if (loss_weightsGPU != NULL) + checkCUDA(__LINE__, cudaFree(loss_weightsGPU)); + }; + + size_t Malloc(Phase phase_) { + std::cout << (train_me ? 
"* " : " "); + std::cout << name << std::endl; + + size_t memoryBytes = 0; + + numExamples = in[0]->dim[0]; + + switch (mode) { + case MultinomialLogistic_StableSoftmax: + case MultinomialLogistic: + if (!(in.size() == 2 || in.size() == 3)) { + std::cout << + "LossLayer: MultinomialLogistic should have 2 or 3 ins" << + std::endl; + FatalError(__LINE__); + } + if (!same_dim_EC(in[0]->dim, in[1]->dim)) { + std::cout << + "LossLayer: MultinomialLogistic should have the same dimensions except channels" << + std::endl; + FatalError(__LINE__); + } + if (in[1]->dim[1] != 1) { + std::cout << + "LossLayer: MultinomialLogistic in[1] should have only 1 channel" << + std::endl; + FatalError(__LINE__); + } + if (in.size() == 3 && !(numel(in[0]->dim) == numel(in[2]->dim) || + sizeofitem(in[0]->dim) == + numel(in[2]->dim))) { + std::cout << + "LossLayer: MultinomialLogistic in[2] size should be either the same with in[0] or should be the same with sizeofitem for in[0]" << + std::endl; + FatalError(__LINE__); + } + loss_numel = numExamples * numspel(in[0]->dim); + break; + case SmoothL1: + if (!(in.size() == 2 || in.size() == 3)) { + std::cout << "LossLayer: SmoothL1 should have 2 or 3 ins" << + std::endl; + FatalError(__LINE__); + } + if (!same_dim(in[0]->dim, in[1]->dim)) { + std::cout << + "LossLayer: SmoothL1 should have the same dimensions" << + std::endl; + FatalError(__LINE__); + } + if (in.size() == 3 && !same_dim(in[0]->dim, in[2]->dim)) { + std::cout << + "LossLayer: SmoothL1 should have the same dimensions" << + std::endl; + FatalError(__LINE__); + } + loss_numel = numel(in[0]->dim); + break; + case Contrastive: + loss_numel = numExamples; + break; + case EuclideanSSE: + if (!(in.size() == 2 || in.size() == 3)) { + std::cout << "LossLayer: EuclideanSSE should have 2 or 3 ins" << + std::endl; + FatalError(__LINE__); + } + if (!same_dim(in[0]->dim, in[1]->dim)) { + std::cout << + "LossLayer: EuclideanSSE should have the same dimensions" << + std::endl; + FatalError(__LINE__); + } + if (in.size() == 3 && !same_dim(in[0]->dim, in[2]->dim)) { + std::cout << + "LossLayer: EuclideanSSE should have the same dimensions" << + std::endl; + FatalError(__LINE__); + } + loss_numel = numel(in[0]->dim); + break; + case HingeL1: + break; + case HingeL2: + break; + case SigmoidCrossEntropy: + break; + case Infogain: + break; + } + scale = loss_weight / loss_numel; + + memoryBytes += loss_numel * sizeofStorageT; + checkCUDA(__LINE__, cudaMalloc(&loss_values, memoryBytes)); + + + if (loss_weights.size() > 0) { + size_t newBytes = loss_weights.size() * sizeofStorageT; + checkCUDA(__LINE__, cudaMalloc(&loss_weightsGPU, newBytes)); + memoryBytes += newBytes; + + StorageT *CPUram = new StorageT[loss_weights.size()]; + for (int i = 0; i < loss_weights.size(); ++i) { + CPUram[i] = CPUCompute2StorageT(loss_weights[i]); + } + checkCUDA(__LINE__, cudaMemcpy(loss_weightsGPU, CPUram, newBytes, + cudaMemcpyHostToDevice)); + delete[] CPUram; + } + + return memoryBytes; + }; + + void display() { + std::cout << " loss = " << loss; + std::cout << " * " << loss_weight; + if (mode == MultinomialLogistic_StableSoftmax || + mode == MultinomialLogistic) + std::cout << " eval = " << result; + std::cout << " "; + }; + + void eval(){ + ComputeT resultSum; + switch(mode){ + case MultinomialLogistic_StableSoftmax: + case MultinomialLogistic: + Accuracy_MultinomialLogistic<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dim[1], numspel(in[0]->dim), + (in.size()==3 ? 
numel(in[2]->dim) : 0), + in[0]->dataGPU, in[1]->dataGPU, loss_weightsGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), + loss_values); + checkCUBLAS(__LINE__, GPUasum(cublasHandle, loss_numel, + loss_values, 1, &resultSum)); + result += resultSum / loss_numel; + Loss_MultinomialLogistic<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dim[1], numspel(in[0]->dim), + (in.size()==3 ? numel(in[2]->dim) : 0), + in[0]->dataGPU, in[1]->dataGPU, loss_weightsGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), + loss_values); + break; + case SmoothL1: + Loss_SmoothL1<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dataGPU, in[1]->dataGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), loss_values); + break; + case Contrastive: + Loss_Contrastive<<>>(CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dim[1], + margin,in[0]->dataGPU, in[1]->dataGPU, in[2]->dataGPU, + loss_values); + break; + case EuclideanSSE: + Loss_EuclideanSSE<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dataGPU, in[1]->dataGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), loss_values); + break; + case HingeL1: + break; + case HingeL2: + break; + case SigmoidCrossEntropy: + break; + case Infogain: + break; + } + ComputeT lossSum; + checkCUBLAS(__LINE__, GPUasum(cublasHandle, loss_numel, + loss_values, 1, &lossSum)); + loss += lossSum/loss_numel; + }; + + + void backward(Phase phase_){ + // either write this in Cuda or get both the prediction and ground truth to CPU and do the computation and write the diff back to GPU + if (in[0]->need_diff){ + switch(mode){ + case MultinomialLogistic_StableSoftmax: + LossGrad_MultinomialLogistic_StableSoftmax<<< + CUDA_GET_BLOCKS(loss_numel), CUDA_NUM_THREADS>>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dim[1], numspel(in[0]->dim), + (in.size()==3 ? numel(in[2]->dim) : 0), scale, + in[0]->dataGPU, in[1]->dataGPU, loss_weightsGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), + in[0]->diffGPU); + break; + case MultinomialLogistic: + LossGrad_MultinomialLogistic<<< + CUDA_GET_BLOCKS(loss_numel), CUDA_NUM_THREADS>>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dim[1], numspel(in[0]->dim), + (in.size()==3 ? numel(in[2]->dim) : 0), scale, + in[0]->dataGPU, in[1]->dataGPU, loss_weightsGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), + in[0]->diffGPU); + break; + case SmoothL1: + LossGrad_SmoothL1<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, scale, in[0]->dataGPU, in[1]->dataGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), + in[0]->diffGPU); + break; + case Contrastive: + LossGrad_Contrastive<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, in[0]->dim[1], margin, scale, + in[0]->dataGPU, in[1]->dataGPU, in[2]->dataGPU, + in[0]->diffGPU, in[1]->diffGPU); + break; + case EuclideanSSE: + LossGrad_EuclideanSSE<<>>( + CUDA_GET_LOOPS(loss_numel), + loss_numel, scale, in[0]->dataGPU, in[1]->dataGPU, + (in.size()==3 ? in[2]->dataGPU : NULL), + in[0]->diffGPU); + break; + case HingeL1: + break; + case HingeL2: + break; + case SigmoidCrossEntropy: + break; + case Infogain: + break; + } + + } + }; +}; + + +/* ---------------------------------------------------------------------------- + * The following LSTM implementation are largely based on LRCN on Caffe. 
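+ *
+ * In outline, the kernels below implement the usual LSTM cell update. With X
+ * holding the 4*dim gate pre-activations per example and "flush" marking
+ * sequence continuation:
+ *   i, f, o = sigmoid(X[0 .. 3*dim))      g = tanh(X[3*dim .. 4*dim))
+ *   c_t     = flush * f * c_{t-1} + i * g
+ *   h_t     = o * tanh(c_t)
+ * LSTMActsForward applies the gate non-linearities, LSTMUnitForward fuses the
+ * cell/hidden update, and LSTMUnitBackward / LSTMActsBackward invert these two
+ * steps when computing gradients.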
+ * + * Project page: http://jeffdonahue.com/lrcn/ + * GitHub page: https://github.com/LisaAnne/lisa-caffe-public + * License page: https://github.com/BVLC/caffe/blob/master/LICENSE + * ---------------------------------------------------------------------------- + */ + +__device__ ComputeT sigmoid(const ComputeT x) { + return ComputeT(1) / (ComputeT(1) + exp(-x)); +} + +__device__ ComputeT tanh(const ComputeT x) { + return ComputeT(2) * sigmoid(ComputeT(2) * x) - ComputeT(1); +} + +__global__ void LSTMActsForward(size_t CUDA_NUM_LOOPS, size_t N, const int dim, const StorageT* X, StorageT* X_acts) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = GPUCompute2StorageT(sigmoid(GPUStorage2ComputeT(X[index]))); + } else { + X_acts[index] = GPUCompute2StorageT(tanh(GPUStorage2ComputeT(X[index]))); + } + } +} + +__global__ void LSTMUnitForward(size_t CUDA_NUM_LOOPS, size_t N, const size_t dim, const StorageT* C_prev, const StorageT* X, const StorageT* flush, StorageT* C, StorageT* H) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + const size_t n = index / dim; + const size_t d = index % dim; + const size_t offset = 4 * dim * n; + const ComputeT i = GPUStorage2ComputeT(X[offset + d]); + const ComputeT f = GPUStorage2ComputeT(X[offset + 1 * dim + d]); + const ComputeT o = GPUStorage2ComputeT(X[offset + 2 * dim + d]); + const ComputeT g = GPUStorage2ComputeT(X[offset + 3 * dim + d]); + const ComputeT c = GPUStorage2ComputeT(flush[n]) * f * GPUStorage2ComputeT(C_prev[index]) + i * g; + C[index] = GPUCompute2StorageT(c); + H[index] = GPUCompute2StorageT(o * tanh(c)); + } +} + +__global__ void LSTMUnitBackward(size_t CUDA_NUM_LOOPS, size_t N, const size_t dim, const StorageT* C_prev, const StorageT* X, const StorageT* C, const StorageT* H, const StorageT* flush, const StorageT* C_diff, const StorageT* H_diff, StorageT* C_prev_diff, StorageT* X_diff) { + const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x)); + if (idxBase >= N) return; + for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){ + const size_t n = index / dim; + const size_t d = index % dim; + const size_t offset = 4 * dim * n; + const ComputeT i = GPUStorage2ComputeT(X[offset + d]); + const ComputeT f = GPUStorage2ComputeT(X[offset + 1 * dim + d]); + const ComputeT o = GPUStorage2ComputeT(X[offset + 2 * dim + d]); + const ComputeT g = GPUStorage2ComputeT(X[offset + 3 * dim + d]); + const ComputeT c_prev = GPUStorage2ComputeT(C_prev[index]); + const ComputeT c = GPUStorage2ComputeT(C[index]); + const ComputeT tanh_c = tanh(c); + + ComputeT h_diff = GPUStorage2ComputeT(H_diff[index]); + const ComputeT c_term_diff = GPUStorage2ComputeT(C_diff[index]) + h_diff * o * (1 - tanh_c * tanh_c); + const ComputeT flush_n = GPUStorage2ComputeT(flush[n]); + + C_prev_diff[index] = GPUCompute2StorageT(flush_n * c_term_diff * f); + const size_t diff_offset = 4 * dim * n; + X_diff[diff_offset + d] = GPUCompute2StorageT(c_term_diff * g); + X_diff[diff_offset + 1 * dim + d] = GPUCompute2StorageT(flush_n * 
+__global__ void LSTMUnitForward(size_t CUDA_NUM_LOOPS, size_t N, const size_t dim, const StorageT* C_prev, const StorageT* X, const StorageT* flush, StorageT* C, StorageT* H) {
+    const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x));
+    if (idxBase >= N) return;
+    for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){
+        const size_t n = index / dim;
+        const size_t d = index % dim;
+        const size_t offset = 4 * dim * n;
+        const ComputeT i = GPUStorage2ComputeT(X[offset + d]);
+        const ComputeT f = GPUStorage2ComputeT(X[offset + 1 * dim + d]);
+        const ComputeT o = GPUStorage2ComputeT(X[offset + 2 * dim + d]);
+        const ComputeT g = GPUStorage2ComputeT(X[offset + 3 * dim + d]);
+        const ComputeT c = GPUStorage2ComputeT(flush[n]) * f * GPUStorage2ComputeT(C_prev[index]) + i * g;
+        C[index] = GPUCompute2StorageT(c);
+        H[index] = GPUCompute2StorageT(o * tanh(c));
+    }
+}
+
+__global__ void LSTMUnitBackward(size_t CUDA_NUM_LOOPS, size_t N, const size_t dim, const StorageT* C_prev, const StorageT* X, const StorageT* C, const StorageT* H, const StorageT* flush, const StorageT* C_diff, const StorageT* H_diff, StorageT* C_prev_diff, StorageT* X_diff) {
+    const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x));
+    if (idxBase >= N) return;
+    for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){
+        const size_t n = index / dim;
+        const size_t d = index % dim;
+        const size_t offset = 4 * dim * n;
+        const ComputeT i = GPUStorage2ComputeT(X[offset + d]);
+        const ComputeT f = GPUStorage2ComputeT(X[offset + 1 * dim + d]);
+        const ComputeT o = GPUStorage2ComputeT(X[offset + 2 * dim + d]);
+        const ComputeT g = GPUStorage2ComputeT(X[offset + 3 * dim + d]);
+        const ComputeT c_prev = GPUStorage2ComputeT(C_prev[index]);
+        const ComputeT c = GPUStorage2ComputeT(C[index]);
+        const ComputeT tanh_c = tanh(c);
+
+        ComputeT h_diff = GPUStorage2ComputeT(H_diff[index]);
+        const ComputeT c_term_diff = GPUStorage2ComputeT(C_diff[index]) + h_diff * o * (1 - tanh_c * tanh_c);
+        const ComputeT flush_n = GPUStorage2ComputeT(flush[n]);
+
+        C_prev_diff[index] = GPUCompute2StorageT(flush_n * c_term_diff * f);
+        const size_t diff_offset = 4 * dim * n;
+        X_diff[diff_offset + d] = GPUCompute2StorageT(c_term_diff * g);
+        X_diff[diff_offset + 1 * dim + d] = GPUCompute2StorageT(flush_n * c_term_diff * c_prev);
+        X_diff[diff_offset + 2 * dim + d] = GPUCompute2StorageT(h_diff * tanh_c);
+        X_diff[diff_offset + 3 * dim + d] = GPUCompute2StorageT(c_term_diff * i);
+    }
+}
+
+__global__ void LSTMActsBackward(size_t CUDA_NUM_LOOPS, size_t N, const size_t dim, const StorageT* X_acts, const StorageT* X_acts_diff, StorageT* X_diff) {
+    const size_t idxBase = size_t(CUDA_NUM_LOOPS) * (size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x));
+    if (idxBase >= N) return;
+    for (size_t index = idxBase; index < min(N,idxBase+CUDA_NUM_LOOPS); ++index ){
+        const size_t x_dim = 4 * dim;
+        const size_t d = index % x_dim;
+        const ComputeT X_act = GPUStorage2ComputeT(X_acts[index]);
+        if (d < 3 * dim) {
+            X_diff[index] = GPUCompute2StorageT(GPUStorage2ComputeT(X_acts_diff[index]) * X_act * (ComputeT(1) - X_act));
+        } else {
+            X_diff[index] = GPUCompute2StorageT(GPUStorage2ComputeT(X_acts_diff[index]) * (ComputeT(1) - X_act * X_act));
+        }
+    }
+}
+
+// A helper for LSTMLayer: computes a single timestep of the LSTM non-linearity,
+// producing the updated cell and hidden states.
+class LSTMUnitLayer : public Layer {
+    size_t X_count;
+    size_t count;
+    size_t hidden_dim;
+public:
+    StorageT* X_acts;
+    StorageT* X_acts_diff;
+    LSTMUnitLayer(std::string name_): Layer(name_), X_acts(NULL), X_acts_diff(NULL){};
+    size_t Malloc(Phase phase_) {
+        std::cout << (train_me ? "* " : "  ");
+        std::cout << name << std::endl;
+        size_t memoryBytes = 0;
+        const size_t X_bytes = in[1]->numBytes();
+        checkCUDA(__LINE__, cudaMalloc(&X_acts_diff, X_bytes) );
+        X_count = numel( in[1]->dim);
+        count = numel(out[1]->dim);
+        hidden_dim = numspel(in[0]->dim);
+        return memoryBytes;
+    };
+    ~LSTMUnitLayer(){
+        if (X_acts_diff!=NULL) checkCUDA(__LINE__, cudaFree(X_acts_diff));
+    };
+    void forward(Phase phase_){
+        LSTMActsForward<<<CUDA_GET_BLOCKS(X_count), CUDA_NUM_THREADS>>>(CUDA_GET_LOOPS(X_count), X_count, hidden_dim, in[1]->dataGPU, X_acts);
+        LSTMUnitForward<<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS>>>(CUDA_GET_LOOPS(count), count, hidden_dim, in[0]->dataGPU, X_acts, in[2]->dataGPU, out[0]->dataGPU, out[1]->dataGPU);
+    };
+    void backward(Phase phase_){
+        LSTMUnitBackward<<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS>>>(CUDA_GET_LOOPS(count), count, hidden_dim, in[0]->dataGPU, X_acts, out[0]->dataGPU, out[1]->dataGPU, in[2]->dataGPU, out[0]->diffGPU, out[1]->diffGPU, in[0]->diffGPU, X_acts_diff);
+        LSTMActsBackward<<<CUDA_GET_BLOCKS(X_count), CUDA_NUM_THREADS>>>(CUDA_GET_LOOPS(X_count), X_count, hidden_dim, X_acts, X_acts_diff, in[1]->diffGPU);
+    };
+};
+
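+// LSTMLayer unrolls the recurrence over seq_length_T timesteps in FillUnrolledNet():
+// the input sequence x is transformed once by a shared InnerProduct layer, and the
+// sliced per-timestep Responses (h_[t], cont_[t], W_xc_x_[t]) alias slices of the
+// corresponding GPU buffers (note the data/diff pointer arguments to Malloc), while
+// working buffers such as c_[t] are allocated separately. All per-timestep sub-layers
+// are collected in sub_layers.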
+class LSTMLayer : public Layer {
+    int batch_size_N;
+    int seq_length_T;
+
+    Response* pResponse_W_xc_x_static;
+    Response* pResponse_W_xc_x;
+    std::vector<Response*> responses_W_xc_x_;
+    std::vector<Response*> responses_cont_;
+    std::vector<Response*> responses_c_;
+    std::vector<Response*> responses_h_;
+    std::vector<Response*> responses_h_conted_;
+    std::vector<Response*> responses_W_hc_h_;
+    std::vector<Response*> responses_gate_input_;
+
+    Response* in0;
+
+    Layer* pLayer_x_transform;
+    Layer* pLayer_x_static_transform;
+    Layer* pLayer_h_conted;
+    Layer* pLayer_transform;
+    Layer* pLayer_gate_input;
+    LSTMUnitLayer* pLayer_lstm_unit;
+
+    std::vector<StorageT*> X_acts_;
+
+    bool debug_mode;
+
+public:
+    int num_output;
+
+    LSTMLayer(std::string name_,
+              int num_output_,
+              ComputeT weight_lr_mult_, Filler weight_filler_, ComputeT weight_filler_param_,
+              ComputeT bias_lr_mult_, Filler bias_filler_, ComputeT bias_filler_param_): Layer(name_), num_output(num_output_), debug_mode(false){
+        weight_filler = weight_filler_;
+        weight_filler_param = weight_filler_param_;
+        bias_filler = bias_filler_;
+        bias_filler_param = bias_filler_param_;
+        weight_lr_mult = weight_lr_mult_;
+        bias_lr_mult = bias_lr_mult_;
+        train_me = true;
+    };
+
+    LSTMLayer(JSON* json){
+        SetOrDie(json, name)
+        SetValue(json, phase, TrainingTesting)
+        SetValue(json, train_me, true)
+        SetValue(json, weight_lr_mult, 1.0)
+        SetValue(json, weight_filler, Xavier)
+        SetValue(json, weight_filler_param, 0.0)
+        SetValue(json, bias_lr_mult, 2.0)
+        SetValue(json, bias_filler, Constant)
+        SetValue(json, bias_filler_param, 0.0)
+        SetValue(json, weight_decay_mult, 1.0)
+        SetValue(json, bias_decay_mult, 1.0)
+        SetOrDie(json, num_output )
+        SetValue(json, debug_mode, false)
+    };
+
+    size_t FillUnrolledNet(Phase phase_){
+
+        size_t memoryBytes = 0;
+        std::vector<int> dim;
+
+        // first h_0 is not part of the output
+        dim.clear();
+        dim.push_back(1);
+        dim.push_back(batch_size_N);
+        dim.push_back(num_output);
+
+        Response* pResponse_h_0 = new Response("h_0", train_me);
+        pResponse_h_0->cublasHandle = cublasHandle;
+        pResponse_h_0->Malloc(dim);
+        responses_h_.push_back(pResponse_h_0);
+
+        // slice out[0] into h_[1...T]
+        size_t items_h_t = numel(dim);
+        for (int t=1;t<=seq_length_T;++t){
+            Response* pResponse_h_t = new Response("h_"+std::to_string(t), train_me);
+            pResponse_h_t->cublasHandle = cublasHandle;
+            pResponse_h_t->Malloc(dim, out[0]->dataGPU + items_h_t * (t-1), out[0]->diffGPU + items_h_t * (t-1));
+            responses_h_.push_back(pResponse_h_t);
+        }
+
+        // slice cont into cont_[t]
+        dim = in[1]->dim;
+        dim[0] = 1;
+        size_t items_cont_t = numel(dim); //in[1]->sizeofitem();
+        //std::cout<<"bytes_cont_t="<<items_cont_t<<std::endl;
+        for (int t=0;t<seq_length_T;++t){
+            Response* pResponse_cont_t = new Response("cont_"+std::to_string(t), false);
+            pResponse_cont_t->cublasHandle = cublasHandle;
+            pResponse_cont_t->Malloc(dim, in[1]->dataGPU + items_cont_t*t, NULL);
+            //std::cout<<"pResponse_cont_t="<<pResponse_cont_t->dataGPU<<std::endl;
+            responses_cont_.push_back(pResponse_cont_t);
+        }
+
+        dim = in[0]->dim;
+        dim.erase(dim.begin());
+        dim[0] *= in[0]->dim[0];
+        while (dim.size()<3) dim.push_back(1);
+        in0 = new Response(this->in[0]->name+"_proxy", in[0]->need_diff);
+        in0->cublasHandle = cublasHandle;
+        in0->Malloc(dim, in[0]->dataGPU, in[0]->diffGPU);
+
+        // Add layer to transform all timesteps of x to the hidden state dimension.
+        // W_xc_x = W_xc * x + b_c
+        pLayer_x_transform = new InnerProductLayer("x_transform", num_output*4, true, weight_lr_mult, weight_filler, weight_filler_param, bias_lr_mult, bias_filler, bias_filler_param);
+        pLayer_x_transform->cudnnHandle = cudnnHandle;
+        pLayer_x_transform->cublasHandle = cublasHandle;
+        pLayer_x_transform->GPU = GPU;
+        pLayer_x_transform->addIn(in0);
+        pResponse_W_xc_x = new Response("W_xc_x", train_me);
+        pResponse_W_xc_x->cublasHandle = cublasHandle;
+        pLayer_x_transform->addOut(pResponse_W_xc_x);
+        memoryBytes += pLayer_x_transform->Malloc(phase_);
+        sub_layers.push_back(pLayer_x_transform);
+
+        // slice W_xc_x into W_xc_x_[t]
+        dim.clear();
+        dim.push_back(1);
+        dim.push_back(in[0]->dim[1]);
+        dim.push_back(num_output*4);
+        size_t items_W_xc_x_t = numel(dim);
+        for (int t=0;t<seq_length_T;++t){
+            Response* pResponse_W_xc_x_t = new Response("W_xc_x_"+std::to_string(t), train_me);
+            pResponse_W_xc_x_t->cublasHandle = cublasHandle;
+            pResponse_W_xc_x_t->Malloc(dim, pResponse_W_xc_x->dataGPU + items_W_xc_x_t*t, pResponse_W_xc_x->diffGPU + items_W_xc_x_t*t);
+            responses_W_xc_x_.push_back(pResponse_W_xc_x_t);
+        }
+
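+        // Optional third input: a static (per-sequence) feature x_static. It gets a
+        // single InnerProduct transform to the 4*num_output gate dimension, with no
+        // per-timestep slicing (unlike x above); presumably its contribution is shared
+        // across all timesteps, following the LRCN implementation.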
+        if (in.size()>2){
+            // Add layer to transform x_static to the gate dimension.
+            // W_xc_x_static = W_xc_static * x_static
+            pLayer_x_static_transform = new InnerProductLayer("W_xc_x_static", num_output*4, false, weight_lr_mult, weight_filler, weight_filler_param, bias_lr_mult, bias_filler, bias_filler_param);
+            pLayer_x_static_transform->cudnnHandle = cudnnHandle;
+            pLayer_x_static_transform->cublasHandle = cublasHandle;
+            pLayer_x_static_transform->GPU = GPU;
+            pLayer_x_static_transform->addIn(this->in[2]);
+            pResponse_W_xc_x_static = new Response("W_xc_x_static", train_me);
+            pResponse_W_xc_x_static->cublasHandle = cublasHandle;
+            pLayer_x_static_transform->addOut(pResponse_W_xc_x_static);
+            memoryBytes += pLayer_x_static_transform->Malloc(phase_);
+            sub_layers.push_back(pLayer_x_static_transform);
+        }
+
+        dim.clear();
+        dim.push_back(1);
+        dim.push_back(batch_size_N);
+        dim.push_back(num_output);
+        // all c
+        for (int t=0;t<seq_length_T+1;++t){
+            Response* pResponse_c_t = new Response("c_"+std::to_string(t), train_me);
+            pResponse_c_t->cublasHandle = cublasHandle;
+            pResponse_c_t->Malloc(dim);
+            responses_c_.push_back(pResponse_c_t);
+        }
+
+        dim.clear();
+        dim.push_back(batch_size_N);
+        dim.push_back(num_output);
+        dim.push_back(1);
+        // all h_conted_
+        for (int t=0;t<seq_length_T;++t){
+            Response* pResponse_h_conted_t = new Response("h_conted_"+std::to_string(t), train_me);
+            pResponse_h_conted_t->cublasHandle = cublasHandle;
+            pResponse_h_conted_t->Malloc(dim);
+            responses_h_conted_.push_back(pResponse_h_conted_t);
+        }
+
+        reset();
+
+        dim.clear();
+        dim.push_back(1);
+        dim.push_back(batch_size_N);
+        dim.push_back(num_output*4);
+        //responses_W_hc_h_
+        for (int t=0;t<seq_length_T;++t){
+            Response* pResponse_W_hc_h_t = new Response("W_hc_h_"+std::to_string(t), train_me);
+            pResponse_W_hc_h_t->cublasHandle = cublasHandle;
+            pResponse_W_hc_h_t->Malloc(dim);
+            responses_W_hc_h_.push_back(pResponse_W_hc_h_t);
+        }
+
+        dim.clear();
+        dim.push_back(batch_size_N);
+        dim.push_back(4);
+        dim.push_back(num_output);
+
+        //responses_gate_input_
+        for (int t=0;t<seq_length_T;++t){
+            Response* pResponse_gate_input_t = new Response("gate_input_"+std::to_string(t), train_me);
+            pResponse_gate_input_t->cublasHandle = cublasHandle;
+            pResponse_gate_input_t->Malloc(dim);
+            responses_gate_input_.push_back(pResponse_gate_input_t);
+        }
+
+        size_t X_bytes = sizeofStorageT * numel(dim);
+        for (int t=0;t