This example shows how to generate CUDA® code from a deep learning network, represented by a SeriesNetwork object. The SeriesNetwork in this example is a convolutional neural network that detects and outputs lane marker boundaries from an image.
CUDA-enabled NVIDIA® GPU with compute capability 3.2 or higher.
NVIDIA CUDA toolkit and driver.
NVIDIA cuDNN library (v7).
OpenCV 3.1.0 libraries for video read and image display operations.
Environment variables for the compilers and libraries. For information on the supported versions of the compilers and libraries, see Third-party Products. For setting up the environment variables, see Environment Variables. A short setenv sketch appears after this list.
Deep Learning Toolbox™ for using SeriesNetwork objects.
GPU Coder™ for generating CUDA code.
GPU Coder Interface for Deep Learning Libraries support package. To install this support package, use the Add-On Explorer.
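The exact locations depend on your installation. As referenced in the environment-variables item above, a minimal sketch of setting the variables from MATLAB might look like the following; the paths shown are placeholders, not the values required by this example.

% Illustrative only -- substitute the paths from your own CUDA, cuDNN, and OpenCV installations
setenv('CUDA_PATH', '/usr/local/cuda');       % CUDA toolkit location (assumed path)
setenv('NVIDIA_CUDNN', '/usr/local/cudnn');   % cuDNN installation root (assumed path)
setenv('OPENCV_DIR', '/usr/local/opencv');    % OpenCV build folder (assumed path)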
Use the coder.checkGpuInstall function to verify that the compilers and libraries needed for running this example are set up correctly.
coder.checkGpuInstall('gpu','codegen','cudnn','quiet');
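In later GPU Coder releases, the same check can be expressed with a coder.gpuEnvConfig object. The sketch below assumes that object form is available in your release; adjust the properties as needed.

envCfg = coder.gpuEnvConfig('host');
envCfg.DeepLibTarget = 'cudnn';   % check the cuDNN deep learning environment
envCfg.DeepCodegen = 1;           % include deep learning code generation checks
envCfg.Quiet = 1;                 % report only errors
coder.checkGpuInstall(envCfg);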
[laneNet, coeffMeans, coeffStds] = getLaneDetectionNetwork();
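The detect_lane function shown later loads the network from laneNet.mat with coder.loadDeepLearningNetwork. If that MAT-file is not already on your path (the support files normally provide it), one way to create it from the workspace variable is:

% Save the trained SeriesNetwork so that coder.loadDeepLearningNetwork
% can find 'laneNet.mat' during simulation and code generation.
save('laneNet.mat', 'laneNet');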
This network takes an image as input and outputs two lane boundaries that correspond to the left and right lanes of the ego vehicle. Each lane boundary is represented by the parabolic equation y = a*x^2 + b*x + c, where y is the lateral offset and x is the longitudinal distance from the vehicle. The network outputs the three parameters a, b, and c per lane. The network architecture is similar to AlexNet, except that the last few layers are replaced by a smaller fully connected layer and a regression output layer.
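For example, a single boundary can be evaluated with polyval, the same call used later in detect_lane.m. The coefficients below are placeholders for illustration, not actual network outputs.

boundary = [-0.005 0.1 1.8];          % placeholder [a b c] coefficients for y = a*x^2 + b*x + c
xWorld = 3:30;                        % longitudinal distances in meters ahead of the sensor
yWorld = polyval(boundary, xWorld);   % lateral offsets of the lane boundary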
laneNet.Layers
ans =

  23x1 Layer array with layers:

     1   'data'          Image Input                   227x227x3 images with 'zerocenter' normalization
     2   'conv1'         Convolution                   96 11x11x3 convolutions with stride [4 4] and padding [0 0 0 0]
     3   'relu1'         ReLU                          ReLU
     4   'norm1'         Cross Channel Normalization   cross channel normalization with 5 channels per element
     5   'pool1'         Max Pooling                   3x3 max pooling with stride [2 2] and padding [0 0 0 0]
     6   'conv2'         Convolution                   256 5x5x48 convolutions with stride [1 1] and padding [2 2 2 2]
     7   'relu2'         ReLU                          ReLU
     8   'norm2'         Cross Channel Normalization   cross channel normalization with 5 channels per element
     9   'pool2'         Max Pooling                   3x3 max pooling with stride [2 2] and padding [0 0 0 0]
    10   'conv3'         Convolution                   384 3x3x256 convolutions with stride [1 1] and padding [1 1 1 1]
    11   'relu3'         ReLU                          ReLU
    12   'conv4'         Convolution                   384 3x3x192 convolutions with stride [1 1] and padding [1 1 1 1]
    13   'relu4'         ReLU                          ReLU
    14   'conv5'         Convolution                   256 3x3x192 convolutions with stride [1 1] and padding [1 1 1 1]
    15   'relu5'         ReLU                          ReLU
    16   'pool5'         Max Pooling                   3x3 max pooling with stride [2 2] and padding [0 0 0 0]
    17   'fc6'           Fully Connected               4096 fully connected layer
    18   'relu6'         ReLU                          ReLU
    19   'drop6'         Dropout                       50% dropout
    20   'fcLane1'       Fully Connected               16 fully connected layer
    21   'fcLane1Relu'   ReLU                          ReLU
    22   'fcLane2'       Fully Connected               6 fully connected layer
    23   'output'        Regression Output             mean-squared-error with 'leftLane_a', 'leftLane_b', and 4 other responses
type detect_lane.m
function [laneFound, ltPts, rtPts] = detect_lane(frame, laneCoeffMeans, laneCoeffStds)
% From the networks output,
% compute left and right lane points in the image coordinates
% The camera coordinates are described by the caltech mono camera model.

%#codegen

persistent lanenet;
if isempty(lanenet)
    lanenet = coder.loadDeepLearningNetwork('laneNet.mat', 'lanenet');
end

lanecoeffsNetworkOutput = lanenet.predict(permute(frame, [2 1 3]));

% Recover original coeffs by reversing the normalization steps
params = lanecoeffsNetworkOutput .* laneCoeffStds + laneCoeffMeans;

isRightLaneFound = abs(params(6)) > 0.5; %c should be more than 0.5 for it to be a right lane
isLeftLaneFound  = abs(params(3)) > 0.5;

vehicleXPoints = 3:30; %meters, ahead of the sensor
ltPts = coder.nullcopy(zeros(28,2,'single'));
rtPts = coder.nullcopy(zeros(28,2,'single'));

if isRightLaneFound && isLeftLaneFound
    rtBoundary = params(4:6);
    rt_y = computeBoundaryModel(rtBoundary, vehicleXPoints);
    ltBoundary = params(1:3);
    lt_y = computeBoundaryModel(ltBoundary, vehicleXPoints);

    % Visualize lane boundaries of the ego vehicle
    tform = get_tformToImage;
    % map vehicle to image coordinates
    ltPts = tform.transformPointsInverse([vehicleXPoints', lt_y']);
    rtPts = tform.transformPointsInverse([vehicleXPoints', rt_y']);
    laneFound = true;
else
    laneFound = false;
end

end

function yWorld = computeBoundaryModel(model, xWorld)
yWorld = polyval(model, xWorld);
end

function tform = get_tformToImage
% Compute extrinsics based on camera setup
yaw = 0;
pitch = 14;  % pitch of the camera in degrees
roll = 0;

translation = translationVector(yaw, pitch, roll);
rotation    = rotationMatrix(yaw, pitch, roll);

% Construct a camera matrix
focalLength    = [309.4362, 344.2161];
principalPoint = [318.9034, 257.5352];
Skew = 0;

camMatrix = [rotation; translation] * intrinsicMatrix(focalLength, ...
    Skew, principalPoint);

% Turn camMatrix into 2-D homography
tform2D = [camMatrix(1,:); camMatrix(2,:); camMatrix(4,:)]; % drop Z

tform = projective2d(tform2D);
tform = tform.invert();
end

function translation = translationVector(yaw, pitch, roll)
SensorLocation = [0 0];
Height = 2.1798;  % mounting height in meters from the ground

rotationMatrix = (...
    rotZ(yaw)*...       % last rotation
    rotX(90-pitch)*...
    rotZ(roll)...       % first rotation
    );

% Adjust for the SensorLocation by adding a translation
sl = SensorLocation;

translationInWorldUnits = [sl(2), sl(1), Height];
translation = translationInWorldUnits*rotationMatrix;
end

%------------------------------------------------------------------
% Rotation around X-axis
function R = rotX(a)
a = deg2rad(a);
R = [...
    1   0        0;
    0   cos(a)  -sin(a);
    0   sin(a)   cos(a)];
end

%------------------------------------------------------------------
% Rotation around Y-axis
function R = rotY(a)
a = deg2rad(a);
R = [...
    cos(a)  0  sin(a);
    0       1  0;
   -sin(a)  0  cos(a)];
end

%------------------------------------------------------------------
% Rotation around Z-axis
function R = rotZ(a)
a = deg2rad(a);
R = [...
    cos(a) -sin(a)  0;
    sin(a)  cos(a)  0;
    0       0       1];
end

%------------------------------------------------------------------
% Given the Yaw, Pitch and Roll, figure out appropriate Euler angles
% and the sequence in which they are applied in order to align the
% camera's coordinate system with the vehicle coordinate system. The
% resulting matrix is a Rotation matrix that together with the
% Translation vector defines the camera extrinsic parameters.
function rotation = rotationMatrix(yaw, pitch, roll)
rotation = (...
    rotY(180)*...        % last rotation: point Z up
    rotZ(-90)*...        % X-Y swap
    rotZ(yaw)*...        % point the camera forward
    rotX(90-pitch)*...   % "un-pitch"
    rotZ(roll)...        % 1st rotation: "un-roll"
    );
end

function intrinsicMat = intrinsicMatrix(FocalLength, Skew, PrincipalPoint)
intrinsicMat = ...
    [FocalLength(1),    0,                 0; ...
     Skew,              FocalLength(2),    0; ...
     PrincipalPoint(1), PrincipalPoint(2), 1];
end
The network computes parameters a, b, and c describing the parabolic equation for the left and right lane boundaries.
From these parameters, we can compute the x and y coordinates corresponding to the lane positions. The coordinates must also be mapped to image coordinates. The detect_lane.m function performs all of these computations. We can generate CUDA code for this function as well.
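Before generating code, you can exercise the function directly in MATLAB, for example with a synthetic frame. This quick check is not part of the original workflow; it only verifies that the interface matches the codegen arguments used below.

testFrame = rand(227, 227, 3, 'single');                               % synthetic input image
[found, ltPts, rtPts] = detect_lane(testFrame, coeffMeans, coeffStds); % run the function in MATLAB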
cfg = coder.gpuConfig('lib');
cfg.DeepLearningConfig = coder.DeepLearningConfig('cudnn');
cfg.GenerateReport = true;
cfg.TargetLang = 'C++';
codegen -args {ones(227,227,3,'single'),ones(1,6,'double'),ones(1,6,'double')} -config cfg detect_lane
Code generation successful: To view the report, open('codegen/lib/detect_lane/html/report.mldatx').
The SeriesNetwork is generated as a C++ class containing an array of 23 layer classes.
class c_lanenet
{
public:
int32_T batchSize;
int32_T numLayers;
real32_T *inputData;
real32_T *outputData;
MWCNNLayer *layers[23];
public:
c_lanenet(void);
void setup(void);
void predict(void);
void cleanup(void);
~c_lanenet(void);
};
The setup() method of the class sets up handles and allocates memory for each layer object. The predict() method invokes prediction for each of the 23 layers in the network.
The files cnn_lanenet_conv*_w and cnn_lanenet_conv*_b are the binary weight and bias files for the convolution layers in the network. The files cnn_lanenet_fc*_w and cnn_lanenet_fc*_b are the binary weight and bias files for the fully connected layers in the network.
codegendir = fullfile('codegen', 'lib', 'detect_lane');
dir(codegendir)
.                               cnn_lanenet_conv5_b
..                              cnn_lanenet_conv5_w
DeepLearningNetwork.cu          cnn_lanenet_fc6_b
DeepLearningNetwork.h           cnn_lanenet_fc6_w
DeepLearningNetwork.o           cnn_lanenet_fcLane1_b
MWCNNLayerImpl.cu               cnn_lanenet_fcLane1_w
MWCNNLayerImpl.hpp              cnn_lanenet_fcLane2_b
MWCNNLayerImpl.o                cnn_lanenet_fcLane2_w
MWCudaDimUtility.cu             cnn_lanenet_responseNames.txt
MWCudaDimUtility.h              codeInfo.mat
MWCudaDimUtility.o              detect_lane.a
MWFusedConvReLULayer.cpp        detect_lane.cu
MWFusedConvReLULayer.hpp        detect_lane.h
MWFusedConvReLULayer.o          detect_lane.o
MWFusedConvReLULayerImpl.cu     detect_lane_initialize.cu
MWFusedConvReLULayerImpl.hpp    detect_lane_initialize.h
MWFusedConvReLULayerImpl.o      detect_lane_initialize.o
MWTargetNetworkImpl.cu          detect_lane_ref.rsp
MWTargetNetworkImpl.hpp         detect_lane_rtw.mk
MWTargetNetworkImpl.o           detect_lane_terminate.cu
buildInfo.mat                   detect_lane_terminate.h
cnn_api.cpp                     detect_lane_terminate.o
cnn_api.hpp                     detect_lane_types.h
cnn_api.o                       examples
cnn_lanenet_avg                 gpu_codegen_info.mat
cnn_lanenet_conv1_b             html
cnn_lanenet_conv1_w             interface
cnn_lanenet_conv2_b             predict.cu
cnn_lanenet_conv2_w             predict.h
cnn_lanenet_conv3_b             predict.o
cnn_lanenet_conv3_w             rtw_proj.tmw
cnn_lanenet_conv4_b             rtwtypes.h
cnn_lanenet_conv4_w
% export mean and std values from the trained network for use in testing
codegendir = fullfile(pwd, 'codegen', 'lib', 'detect_lane');
fid = fopen(fullfile(codegendir, 'mean.bin'), 'w');
A = [coeffMeans coeffStds];
fwrite(fid, A, 'double');
fclose(fid);
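As an optional sanity check (not part of the original example), you can read the binary file back and compare it against the exported values:

fid = fopen(fullfile(codegendir, 'mean.bin'), 'r');
B = fread(fid, 12, 'double');          % first 6 values are the means, last 6 are the stds
fclose(fid);
isequal(B', [coeffMeans coeffStds])    % expected to return true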
The generated network code must be compiled together with a main file. The main file uses the OpenCV VideoCapture method to read frames from the input video. Each frame is processed and classified until no more frames remain to be read. Before the output for each frame is displayed, it is post-processed using the detect_lane function generated in detect_lane.cu.
type main_lanenet.cpp
/* Copyright 2016 The MathWorks, Inc. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "opencv2/opencv.hpp"
#include <list>
#include <cmath>
#include "detect_lane.h"

using namespace cv;

void readData(float *input, Mat& orig, Mat & im)
{
    Size size(227,227);
    resize(orig,im,size,0,0,INTER_LINEAR);
    for(int j=0;j<227*227;j++)
    {
        //BGR to RGB
        input[2*227*227+j]=(float)(im.data[j*3+0]);
        input[1*227*227+j]=(float)(im.data[j*3+1]);
        input[0*227*227+j]=(float)(im.data[j*3+2]);
    }
}

void addLane(float pts[28][2], Mat & im, int numPts)
{
    std::vector<Point2f> iArray;
    for(int k=0; k<numPts; k++)
    {
        iArray.push_back(Point2f(pts[k][0],pts[k][1]));
    }
    Mat curve(iArray, true);
    curve.convertTo(curve, CV_32S); //adapt type for polylines
    polylines(im, curve, false, CV_RGB(255,255,0), 2, CV_AA);
}

void writeData(float *outputBuffer, Mat & im, int N, double means[6], double stds[6])
{
    // get lane coordinates
    boolean_T laneFound = 0;
    float ltPts[56];
    float rtPts[56];
    detect_lane(outputBuffer, means, stds, &laneFound, ltPts, rtPts);

    if (!laneFound)
    {
        return;
    }

    float ltPtsM[28][2];
    float rtPtsM[28][2];
    for(int k=0; k<28; k++)
    {
        ltPtsM[k][0] = ltPts[k];
        ltPtsM[k][1] = ltPts[k+28];
        rtPtsM[k][0] = rtPts[k];
        rtPtsM[k][1] = rtPts[k+28];
    }
    addLane(ltPtsM, im, 28);
    addLane(rtPtsM, im, 28);
}

void readMeanAndStds(const char* filename, double means[6], double stds[6])
{
    FILE* pFile = fopen(filename, "rb");
    if (pFile==NULL)
    {
        fputs ("File error",stderr);
        return;
    }

    // obtain file size
    fseek (pFile , 0 , SEEK_END);
    long lSize = ftell(pFile);
    rewind(pFile);

    double* buffer = (double*)malloc(lSize);

    size_t result = fread(buffer,sizeof(double),lSize,pFile);
    if (result*sizeof(double) != lSize) {
        fputs ("Reading error",stderr);
        return;
    }

    for (int k = 0 ; k < 6; k++)
    {
        means[k] = buffer[k];
        stds[k] = buffer[k+6];
    }
    free(buffer);
}

// Main function
int main(int argc, char* argv[])
{
    float *inputBuffer = (float*)calloc(sizeof(float),227*227*3);
    float *outputBuffer = (float*)calloc(sizeof(float),6);

    if ((inputBuffer == NULL) || (outputBuffer == NULL)) {
        printf("ERROR: Input/Output buffers could not be allocated!\n");
        exit(-1);
    }

    // get ground truth mean and std
    double means[6];
    double stds[6];
    readMeanAndStds("mean.bin", means, stds);

    if (argc < 2)
    {
        printf("Pass in input video file name as argument\n");
        return -1;
    }

    VideoCapture cap(argv[1]);
    if (!cap.isOpened()) {
        printf("Could not open the video capture device.\n");
        return -1;
    }

    cudaEvent_t start, stop;
    float fps = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    Mat orig, im;
    namedWindow("Lane detection demo",CV_WINDOW_NORMAL);
    while(true)
    {
        cudaEventRecord(start);
        cap >> orig;
        if (orig.empty()) break;
        readData(inputBuffer, orig, im);

        writeData(inputBuffer, orig, 6, means, stds);

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        char strbuf[50];
        float milliseconds = -1.0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        fps = fps*.9+1000.0/milliseconds*.1;
        sprintf (strbuf, "%.2f FPS", fps);
        putText(orig, strbuf, cvPoint(200,30), CV_FONT_HERSHEY_DUPLEX, 1, CV_RGB(0,0,0), 2);

        imshow("Lane detection demo", orig);
        if( waitKey(50)%256 == 27 ) break; // stop capturing by pressing ESC
    }
    destroyWindow("Lane detection demo");

    free(inputBuffer);
    free(outputBuffer);

    return 0;
}
if ~exist('./caltech_cordova1.avi', 'file')
    url = 'https://www.mathworks.com/supportfiles/gpucoder/media/caltech_cordova1.avi';
    websave('caltech_cordova1.avi', url);
end
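Optionally, you can preview the downloaded video in MATLAB before launching the executable. This step is not required by the example and uses the standard VideoReader interface.

vr = VideoReader('caltech_cordova1.avi');   % open the test video
frame = readFrame(vr);                      % read the first frame
imshow(frame);                              % display it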
if ispc
    setenv('MATLAB_ROOT', matlabroot);
    vspath = mex.getCompilerConfigurations('C++').Location;
    setenv('VSPATH', vspath);
    system('make_win.bat');
    cd(codegendir);
    system('lanenet.exe ..\..\..\caltech_cordova1.avi');
else
    setenv('MATLAB_ROOT', matlabroot);
    system('make -f Makefile.mk');
    cd(codegendir);
    system('./lanenet ../../../caltech_cordova1.avi');
end