
Training a DDPG agent using the Reinforcement Learning Toolbox: training performance is poor

Hey there. I'm working on training a DDPG agent with the MATLAB Reinforcement Learning Toolbox, but the training performance is poor. Could anyone help with this?
My environment was modified from an environment template in the MATLAB documentation. The purpose of the environment is to calculate the circuit efficiency for a given set of component parameters; the reward is higher for circuits with higher efficiency. See below:
classdef conv_contin_formula < rl.env.MATLABEnvironment
    properties
        % Specify necessary global parameters
        Vinput = 48;
        Vout = 5;
        Pout = 20;
        Duty = 5/48;
        Rout = 5^2/20;
    end
    properties
        % Initialize system state [frequency, Inductor, capacitor]'
        State = zeros(3,1)
    end
    properties(Access = protected)
        IsDone = false
    end
    methods
        % Constructor method creates an instance of the environment
        function this = conv_contin_formula()
            % Initialize Observation settings
            ObservationInfo = rlNumericSpec([3 1]);
            ObservationInfo.Name = 'States';
            ObservationInfo.Description = 'f, L, C';
            % Initialize Action settings
            ActionInfo = rlNumericSpec([3 1],'LowerLimit',0.5,'UpperLimit',2);
            ActionInfo.Name = 'Converter Action';
            % The following line implements built-in functions of the RL environment
            this = this@rl.env.MATLABEnvironment(ObservationInfo,ActionInfo);
            % Initialize property values and pre-compute necessary values
            updateActionInfo(this);
        end
        % Apply system dynamics and calculate circuit efficiency
        function [Observation,Reward,IsDone,LoggedSignals] = step(this,Action)
            LoggedSignals = [];
            % Get action: scale the previous states by multiplying coefficients: S' = A .* S
            % Action: the coefficients to multiply
            Act = getForce(this,Action);
            % Update system states
            Observation = this.State .* Act;
            this.State = Observation;
            % Check terminal condition: terminate when any state leaves the range generated by reset
            X = Observation(1);
            Y = Observation(2);
            Z = Observation(3);
            IsDone = X > 50*10e4 || Y > 10e-4 || Z > 40*10e-4 || X < 10e4 || Y < 10e-6 || Z < 10e-4;
            this.IsDone = IsDone;
            % Get reward
            Reward = getReward(this,Observation);
            notifyEnvUpdated(this);
        end
        % Reset environment to initial state and output initial observation
        function InitialObservation = reset(this)
            numbf = randi(50);  f0 = numbf*10e4;
            numbL = randi(100); L0 = numbL*10e-6;
            numbC = randi(40);  C0 = numbC*10e-4;
            InitialObservation = [f0; L0; C0];
            this.State = InitialObservation;
            notifyEnvUpdated(this);
        end
    end
    methods
        function temp = getForce(~,Action)
            temp = Action;
        end
        % Update the action info
        function updateActionInfo(this)
        end
        % Reward function
        function Reward = getReward(this,Observation)
            New_Tsw = 1/Observation(1);
            New_Ind = Observation(2);
            New_Cap = Observation(3);
            New_Fsw = Observation(1);
            %----------------------- calculate power efficiency --------------------------
            Rdson = 0.1;        % MOSFET on-resistance 0.1 ohm
            DCR   = 0.08;       % inductor direct current resistance 0.08 ohm
            tr    = 4*10^-9;    % MOSFET rise time 4 ns
            tf    = 6*10^-9;    % MOSFET fall time 6 ns
            Irr   = 0.3;        % peak value of body diode reverse recovery current 0.3 A
            trr   = 25*10^-9;   % body diode reverse recovery time 25 ns
            % Calculation of other parameters with given parameters
            D  = this.Vout/this.Vinput;                 % duty cycle
            Io = this.Pout/this.Vout;                   % output current
            Iripple = this.Vout*(1-D)/New_Ind/New_Fsw;  % inductor current ripple
            % High-side conduction loss
            PonH = Rdson*(Io^2 + Iripple^2/12)*D;
            % Switching loss
            PswH = 0.5*this.Vinput*Io*(tr+tf)*New_Fsw;
            % Low-side conduction loss
            PonL = Rdson*(Io^2 + Iripple^2/12)*(1-D);
            Pdiode = 0.5*this.Vinput*Irr*trr*New_Fsw;
            % Conduction loss in the inductor
            PL = (Io^2 + Iripple^2/12)*DCR;
            % Total loss
            Ploss = PonH + PswH + PonL + Pdiode + PL;
            % Efficiency
            power_efficiency = this.Pout/(this.Pout + Ploss);
            %------------------------------------------------------------------------------
            Reward = power_efficiency;
        end
        % (optional) Properties validation through set methods
        function set.State(this,state)
            validateattributes(state,{'numeric'},{'finite','real','vector','numel',3},'','State');
            this.State = double(state(:));
            notifyEnvUpdated(this);
        end
    end
    methods (Access = protected)
        % (optional) Update visualization every time the environment is updated
        function envUpdatedCallback(this)
            plot(this.State)
            hold off
            XLimMode = 'auto';
            YLimMode = 'auto';
        end
    end
end
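For reference, here is a quick sanity check I use to exercise the environment on its own before training (a minimal sketch; validateEnvironment, reset, and step come from the toolbox/environment class, and the action vector is just an arbitrary set of coefficients inside the [0.5, 2] action limits):
% Sanity check: run the environment by itself (sketch)
checkEnv = conv_contin_formula;
validateEnvironment(checkEnv)              % verifies reset/step against the obs/action specs
obs0 = reset(checkEnv);                    % random initial state [f; L; C]
act  = [1.1; 0.9; 1.0];                    % arbitrary coefficients within the [0.5, 2] limits
[obs1,reward,isDone] = step(checkEnv,act); % one step; the reward is the efficiency
fprintf('Efficiency reward = %.4f, IsDone = %d\n', reward, isDone)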
The DDPG agent is set up as follows:
env = conv_contin_formula;
obsInfo = getObservationInfo(env);
numObs = obsInfo.Dimension(1);
actInfo = getActionInfo(env);
numAct = actInfo.Dimension(1);
rng(0)
numLayer = 128;
numLayer2 = 64;
statePath = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(numLayer,'Name','CriticStateFC1')
    reluLayer('Name','CriticRelu1')
    fullyConnectedLayer(numLayer2,'Name','CriticStateFC2')];
actionPath = [
    featureInputLayer(numAct,'Normalization','none','Name','action')
    fullyConnectedLayer(numLayer2,'Name','CriticActionFC1','BiasLearnRateFactor',0)];
commonPath = [
    additionLayer(2,'Name','add')
    reluLayer('Name','CriticCommonRelu')
    fullyConnectedLayer(1,'Name','CriticOutput')];
criticNetwork = layerGraph(statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
criticOptions = rlRepresentationOptions('LearnRate',0.01,'GradientThreshold',1);
critic = rlQValueRepresentation(criticNetwork,obsInfo,actInfo,...
    'Observation',{'observation'},'Action',{'action'},criticOptions);
actorNetwork = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(numLayer,'Name','ActorFC1')
    reluLayer('Name','ActorRelu1')
    fullyConnectedLayer(numLayer2,'Name','ActorFC2')
    reluLayer('Name','ActorRelu2')
    fullyConnectedLayer(numAct,'Name','ActorFC3')
    tanhLayer('Name','ActorTanh')
    scalingLayer('Name','ActorScaling','Scale',max(actInfo.UpperLimit))];
actorOpts = rlRepresentationOptions('LearnRate',0.1,'GradientThreshold',1);
actor = rlDeterministicActorRepresentation(actorNetwork,obsInfo,actInfo,...
    'Observation',{'observation'},'Action',{'ActorScaling'},actorOpts);
agentOptions = rlDDPGAgentOptions(...
    'TargetSmoothFactor',1e-3,...
    'ExperienceBufferLength',1e6,...
    'DiscountFactor',0.2,...
    'MiniBatchSize',128);
agent = rlDDPGAgent(actor,critic,agentOptions);
maxepisodes = 10000;
maxsteps = 2;
trainingOptions = rlTrainingOptions(...
    'MaxEpisodes',maxepisodes,...
    'MaxStepsPerEpisode',maxsteps,...
    'StopOnError',"on",...
    'Verbose',false,...
    'Plots',"training-progress",...
    'StopTrainingCriteria',"AverageReward",...
    'StopTrainingValue',500,...
    'ScoreAveragingWindowLength',100,...
    'SaveAgentCriteria',"EpisodeReward",...
    'SaveAgentValue',500);
trainingStats = train(agent,env,trainingOptions);
trained_critic = getCritic(agent);
trained_table = getLearnableParameters(trained_critic);
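After training, I also inspect the trained policy like this (a minimal sketch; rlSimulationOptions, sim, and getAction are toolbox functions, and the sample observation is just an arbitrary state inside the range used by reset):
% Roll out the trained agent on the environment (sketch)
simOpts = rlSimulationOptions('MaxSteps',maxsteps);
experience = sim(env,agent,simOpts);
episodeReward = sum(experience.Reward.Data)   % total efficiency reward over the rollout
% Query the deterministic policy at an example state [f; L; C]
sampleObs = [2e5; 5e-5; 4e-3];
sampleAct = getAction(agent,{sampleObs})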
The training result is shown below; the performance is very poor. I'm not sure whether the issue is with the environment or with the agent settings. Any suggestions? Thank you very much for your time!

Release: R2020b