When I use the Reinforcement Learning Toolbox to train a SAC agent, the policy keeps oscillating between the action upper and lower limits and does not explore well, whereas DDPG and PPO agents are able to do some effective exploration. What could be the reason?
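For reference, the SAC entropy (temperature) settings are left at their toolbox defaults in the script below. A minimal sketch of how they can be inspected or adjusted is shown here; the EntropyWeightOptions fields follow my reading of rlSACAgentOptions, and the values are only illustrative assumptions, not a confirmed fix:

% Sketch only: inspect / adjust the SAC entropy (temperature) options.
% Field names assume rlSACAgentOptions.EntropyWeightOptions; values are illustrative.
opts = rlSACAgentOptions;
disp(opts.EntropyWeightOptions)                 % defaults used when left untouched
opts.EntropyWeightOptions.TargetEntropy = -3;   % e.g. minus the action dimension
opts.EntropyWeightOptions.LearnRate = 3e-4;     % learning rate of the entropy weight

The full script, step function, and reset function are below.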
%main
% Observation and action space definitions
% numObs = 11; % observation space dimension
% numAct = 4;  % action space dimension
numObs1 = 7; % observation space dimension
numAct1 = 3; % action space dimension
% Action components: BS, EB, CL
actLowerLimit = [-100 ;-200 ; -50];
actUpperLimit = [100 ; 200  ; 50];
obsInfo = rlNumericSpec([numObs1 1]);
obsInfo.Name = 'ObservationSac1';
% Continuous action space
actInfo = rlNumericSpec([numAct1 1],...
              'LowerLimit',actLowerLimit,...
              'UpperLimit',actUpperLimit);            
actInfo.Name = 'ActionSac';
% Create the reinforcement learning environment
env = rlFunctionEnv(obsInfo, actInfo, 'sacStepFunction', 'sacResetFunction');
% Network structure parameters
criticLayerSizes = [64 32];
actorLayerSizes = [64 32];
% First critic network. A SAC critic takes both the observation and the action as
% inputs and outputs a single Q-value, so each critic network has two input paths.
% Observation input path
obsPath = [
    featureInputLayer(numObs1, Name="obsPathInLyr")
    fullyConnectedLayer(criticLayerSizes(1))
    reluLayer
    fullyConnectedLayer(criticLayerSizes(1),Name="obsout")
    ];
% Action input path
actPath = [
    featureInputLayer(numAct1, Name="actPathInLyr")
    fullyConnectedLayer(criticLayerSizes(1))
    reluLayer
    fullyConnectedLayer(criticLayerSizes(1),Name="actout")
    ];
% Merged (common) path
comPath = [
    concatenationLayer(1,2,Name="cct")
    fullyConnectedLayer(criticLayerSizes(2))
    reluLayer    
    fullyConnectedLayer(1, Name="output")
    ];
% Assemble the critic network
criticNetwork = dlnetwork();
criticNetwork = addLayers(criticNetwork,obsPath);
criticNetwork = addLayers(criticNetwork,actPath); 
criticNetwork = addLayers(criticNetwork,comPath);
criticNetwork = connectLayers(criticNetwork,"obsout","cct/in1");
criticNetwork = connectLayers(criticNetwork,"actout","cct/in2");
critic11 = rlQValueFunction(criticNetwork,obsInfo,actInfo, ...
    ActionInputNames="actPathInLyr", ...
    ObservationInputNames="obsPathInLyr");
critic12 = rlQValueFunction(criticNetwork,obsInfo,actInfo, ...
    ActionInputNames="actPathInLyr", ...
    ObservationInputNames="obsPathInLyr");
% Create the actor (policy) network. The SAC actor takes the observation as input and
% outputs the mean and standard deviation of the action distribution (one input path, two output paths).
% Input path
inPath = [ 
    featureInputLayer( ...
        numObs1, ...
        Name="netOin")
        reluLayer
    fullyConnectedLayer( ...
        actorLayerSizes(1), ...
        Name="nethid")
         reluLayer
    fullyConnectedLayer( ...
        actorLayerSizes(2), ...
        Name="infc") 
    ];
meanPath = [ 
    % tanhLayer(Name="tanhMean");
    fullyConnectedLayer(numAct1,Name="FCMean");
    % scalingLayer(Name="scale", ...
    % Scale=actUpperLimit),
    ];
sdevPath = [ 
    reluLayer(Name="reluStdv");
    fullyConnectedLayer(numAct1,Name="FCStdv");
    softplusLayer(Name="splus") 
    ];
actorNetwork = dlnetwork();
actorNetwork = addLayers(actorNetwork,inPath);
actorNetwork = addLayers(actorNetwork,meanPath);
actorNetwork = addLayers(actorNetwork,sdevPath);
% actorNetwork = connectLayers(actorNetwork,"infc","tanhMean/in");
% actorNetwork = connectLayers(actorNetwork,"infc","tanhStdv/in");
actorNetwork = connectLayers(actorNetwork,"infc","FCMean/in");
actorNetwork = connectLayers(actorNetwork,"infc","reluStdv/in");
% Create the stochastic policy representation (Gaussian sampling)
actor1 = rlContinuousGaussianActor(actorNetwork, obsInfo, actInfo, ...
    ActionMeanOutputNames="FCMean",...
    ActionStandardDeviationOutputNames="splus",...
    ObservationInputNames="netOin");
act = getAction(actor1,{rand(obsInfo.Dimension)}); 
dist = evaluate(actor1,{rand(obsInfo.Dimension)});
% Critic optimizer settings
criticOptions = rlOptimizerOptions( ...
    Optimizer="adam", ...
    LearnRate=1e-3,... 
    GradientThreshold=1, ...
    L2RegularizationFactor=2e-4);
% Actor optimizer settings
actorOptions = rlOptimizerOptions( ...
    Optimizer="adam", ...
    LearnRate=1e-3,...
    GradientThreshold=1, ...
    L2RegularizationFactor=1e-5);
% SAC agent options
sacOptions = rlSACAgentOptions(...
    'TargetSmoothFactor',1e-3,...  % target network smoothing factor
    'ExperienceBufferLength',5000,...  % experience buffer size
    'MiniBatchSize',256,...  % mini-batch size
    'DiscountFactor',0.99,...  % discount factor
    'SampleTime',1,...  % sample time
    'CriticOptimizerOptions', criticOptions,...
    'ActorOptimizerOptions',actorOptions);
% Create the SAC agent
agent1 = rlSACAgent(actor1,[critic11 critic12],sacOptions);
% Training options
trainOpts = rlTrainingOptions(...
    'MaxEpisodes',500,...  % maximum number of training episodes
    'MaxStepsPerEpisode',96,...  % maximum number of steps per episode
    'Verbose',true,...  % print training information in the command window
    'Plots','training-progress',...  % show the training progress plot
    'StopTrainingCriteria','AverageReward',...  % stop-training criterion
    'StopTrainingValue',0,...  % average reward at which training stops
    'ScoreAveragingWindowLength',10,...  % window length for averaging the reward
    'SaveAgentCriteria',"EpisodeReward",...  % criterion for saving the agent
    'SaveAgentValue',0);  % episode reward at which the agent is saved
% % Single-agent training options (alternative)
% trainOpts = rlTrainingOptions(...
%     Plots='training-progress',...
%     MaxEpisodes=500,...
%     MaxStepsPerEpisode=96,...
%     ScoreAveragingWindowLength=10,...
%     StopTrainingCriteria="AverageReward", ...
%     StopTrainingValue=0); 
%     %"LearningStrategy","decentralized",...
%     % 'Verbose',true,  ...
% Train the agent
result = train(agent1,env,trainOpts);
%% Test
agent = agent1;            % agent1 is updated in place by train
% agent = agent_Trained;   % or assign a previously saved / loaded agent here
simSteps = 200;
simOptions = rlSimulationOptions('MaxSteps',simSteps);
experience = sim(env,agent,simOptions);
simActionSeries = experience.Action.ActionSac.Data;
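% Illustrative addition (not part of the original workflow): plot the simulated
% actions against the action limits to see whether they saturate at the bounds.
figure
plot(squeeze(simActionSeries)')                % one line per action channel
hold on
plot(repmat(actUpperLimit,1,simSteps)','--')   % upper limits
plot(repmat(actLowerLimit,1,simSteps)','--')   % lower limits
hold off
xlabel('Step'); ylabel('Action value')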
%STEPFUNCTION
function [NextObs,Reward,IsDone,LoggedSignals] = sacStepFunction(Action,LoggedSignals)
Q_BS=500;
% Load the one-day electricity demand of each of the three integrated energy systems
load('load_e.mat');
LOAD_EE = 0.6*transpose(load_e);
LOAD_EE1 = LOAD_EE(1:96);
LOAD_EE2 = LOAD_EE(97:192);
LOAD_EE3 = LOAD_EE(193:288);
% Time-of-use electricity purchase price, expanded to 96 intervals
Power_B1=[0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77  0.77 0.77];
Power_Buy=zeros(1,96);
for n=1:24
    Power_Buy(4*n-3:4*n)=Power_B1(n);
end
% PV output at 15-minute resolution
load('PV.mat');
PV1_96 = transpose(PV(:,1));
PV2_96 = transpose(PV(:,2));
PV3_96 = transpose(PV(:,3));
% EB and CHP setpoints
load('PSO_data.mat')
CHP1 = PSO_data(:,1);
CHP2 = PSO_data(:,2);
CHP3 = PSO_data(:,3);
EB1 = PSO_data(:,4);
EB2 = PSO_data(:,5);
EB3 = PSO_data(:,6);
% Command delay (s), 96-by-3
delay11 = 2*ones(96,1);
delay12 = 2*ones(96,1);
delay13 = 2*ones(96,1);
%BS, EB, BUY, CL, TR
%action_space = rlNumericSpec([5 1], 'LowerLimit', action_lowerlimits, 'UpperLimit', action_upperlimits);
% Current state values (per agent): T_solt; LOAD_E; Power_buy; PV; CHP; EB; delay
State = LoggedSignals.State;
% Debug output
% disp(class(Action));
% disp(size(Action));
disp(Action);
% Update the state for each agent
NextObs = zeros(size(State));
Reward = 0;
for agent_i = 1:1
    T_solt = State(1,agent_i);
    LOAD_E = State(2,agent_i);
    Power_buy = State(3,agent_i);
    PV = State(4,agent_i);
    CHP = State(5,agent_i);
    EB = State(6,agent_i);
    delay = State(7,agent_i);
    T_next = T_solt + 1;
    LOAD_E_next = LOAD_EE1(T_next);
    Power_buy_next = Power_Buy(T_next);
    PV_next = PV1_96(T_next);
    CHP_next = CHP1(T_next);
    EB_next = EB1(T_next);
    delay_next = delay11(T_next);
    % Update the state
    NextObs(:, agent_i) = [T_next; LOAD_E_next; Power_buy_next; PV_next; CHP_next; EB_next; delay_next];
    % Reward
    % Regulation cost
    % Action components: [BS EB CL]
    LOAD_real= PV(1) + 0.9 * CHP -(EB/0.95 - Action(2)) + Action(3) + Action(1);
    % Balance term: power purchased from the grid
    BUY = LOAD_E(1) - LOAD_real;
    COST = (0.5 * abs(Action(1)) + 0.2 * abs(Action(2)) + Power_buy * BUY + 0.5 * abs(Action(3)) );
    % Delay deviation cost
    % if BUY<=500 && BUY>0
    %     Penalty_local = 0.5*BUY;
    % elseif BUY>500 && BUY<=1000
    %     Penalty_local = 2*BUY;
    % else
    %     Penalty_local = 5*BUY;
    % end
    %Penalty_local = LOAD_E(1) - PV(1) + Action(1) + Action(2) - Action(3) -  Action(4) - 0.8 * CHP + EB / 0.95; % positive means net consumption
    % if abs(Penalty_local)<=80
    %     Penalty_local = 1*abs(Penalty_local);
    % elseif abs(Penalty_local)<=120
    %     Penalty_local = 2*abs(Penalty_local);
    % elseif abs(Penalty_local)>120
    %     Penalty_local = 3*abs(Penalty_local);
    % else
    %     Penalty_local = 0;
    % end
    %Penalty_local
    % % Global power-balance constraint
    % Penalty_global = 100*(sum(LOAD_E) -sum(PV) + sum(Action(1)) + sum(Action(2)) - sum(Action(3)) - sum(Action(4)) - sum(Action(5))) ;
    % if Penalty_global>300 || Penalty_local<-300
    % Penalty_global=100;
    % else
    % Penalty_global = 0;
    % end
    % Power exchange constraint
    % Penalty_Pt = 100*sum(Action(5,:));
    % Reward = -COST- Penalty_local;
     Reward = -COST;
end
LoggedSignals.State = NextObs;
LoggedSignals.action=Action;
NextObs = mat2cell(NextObs, 7, 1);
% Check whether the episode has finished
%T_next
IsDone=(T_next >= 96);
end
%RESET
% Environment reset function
function [InitialObservation, LoggedSignal] = sacResetFunction() % reset the RL environment
% Load the one-day electricity demand of each of the three integrated energy systems
load('load_e.mat');
LOAD_E = 0.6*transpose(load_e);
LOAD_E1 = LOAD_E(1:96);
LOAD_E2 = LOAD_E(97:192);
LOAD_E3 = LOAD_E(193:288);
% Time-of-use electricity purchase price, expanded to 96 intervals
Power_B=[0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77  0.77 0.77];
Power_buy=zeros(1,96);
for n=1:24
    Power_buy(4*n-3:4*n)=Power_B(n);
end
% PV output at 15-minute resolution
load('PV.mat');
PV_1 = transpose(PV(:,1));
PV_2 = transpose(PV(:,2));
PV_3 = transpose(PV(:,3));
% Command delay (s), 96-by-3
delay1 = 2*ones(96,1);
delay2 = 2*ones(96,1);
delay3 = 2*ones(96,1);
% EB and CHP setpoints
load('PSO_data.mat')
CHP1 = PSO_data(:,1);
CHP2 = PSO_data(:,2);
CHP3 = PSO_data(:,3);
EB1 = PSO_data(:,4);
EB2 = PSO_data(:,5);
EB3 = PSO_data(:,6);
% Initialization
T_solt = 1;
LOAD_E1 = LOAD_E1(1);
LOAD_E2 = LOAD_E2(1);
LOAD_E3 = LOAD_E3(1);
Power_buy = Power_buy(1);
PV_1 = PV_1(1);
PV_2 = PV_2(1);
PV_3 = PV_3(1);
delay1 = delay1(1);
delay2 = delay2(1);
delay3 = delay3(1);
CHP1 = CHP1(1);
CHP2 = CHP2(1);
CHP3 = CHP3(1);
EB1=EB1(1);
EB2=EB2(1);
EB3=EB3(1);
% Initial observation of the agent state (agents 2 and 3 are commented out)
LoggedSignal.State(:,1) = [T_solt;LOAD_E1;Power_buy;PV_1;CHP1;EB1;delay1];
% LoggedSignal.State(:,2) = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% LoggedSignal.State(:,3) = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% LoggedSignal.State = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% Return the initial environment state as the logged signal LoggedSignal
% InitialObservation = {LoggedSignal.Agent1State, LoggedSignal.Agent2State, LoggedSignal.Agent3State};
 InitialObservation = {LoggedSignal.State(:,1)};
end