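Here is a possible workaround: instead of training the network to predict the next state directly, train it to predict the difference between the next state and the current state, and add that predicted difference to the current state at prediction time.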
%% - cleanup
% Start from a clean session: empty command window, no open figures,
% no leftover workspace variables.
clc;
close('all');
clearvars;
%% - data
% Simulate the system  x1' = x2,  x2' = 10*sin(x1) - x2  with ode45 and
% build one-step training pairs (current state -> increment to next state).
numSamples = 1000;
t = linspace(0, 5, numSamples);      % time grid at which ode45 reports the solution
x0 = [pi/2; 0];                      % initial state as a column vector
stateDerivative = @(~, s) [s(2, :); 10*sin(s(1, :)) - s(2, :)];
[~, trajectory] = ode45(stateDerivative, t, x0);
x = trajectory.';                    % one row per state, one column per time step
X = x(:, 1:end-1);                   % network input: state at step k
Y = diff(x, 1, 2);                   % network target: x(:, k+1) - x(:, k)
%% - define and train lstm network
% Sequence-to-sequence regression network: z-score normalised input,
% a single LSTM layer, and a fully connected read-out producing the
% state increment. Trained with Adam on the single sequence (X, Y).
inputSize  = 2;      % number of state components fed to the network
outputSize = 2;      % number of increment components predicted
hiddenSize = 200;    % number of LSTM hidden units

inputStage     = sequenceInputLayer(inputSize, 'Normalization', 'zscore');
recurrentStage = lstmLayer(hiddenSize);
readoutStage   = fullyConnectedLayer(outputSize);
layers = [inputStage; recurrentStage; readoutStage; regressionLayer];

opts = trainingOptions('adam', ...
    'Plots', 'training-progress', ...
    'MaxEpochs', 100);
net = trainNetwork(X, Y, layers, opts);
%% - prediction
% Closed-loop rollout: starting from x0, feed the network its own previous
% prediction, and integrate the predicted increments to obtain the state.
%
% The loop overwrites `net` with its updated recurrent state, so the state
% is reset first; otherwise re-running this section would start the rollout
% from the hidden state left behind by the previous run.
net = resetState(net);

numSteps = numel(t);
xpred = zeros(numel(x0), numSteps);   % preallocate instead of growing in the loop
xpred(:, 1) = x0;
for k = 1 : numSteps-1
    [net, dxpred] = predictAndUpdateState(net, xpred(:, k));
    % Cast to double so the increment is accumulated in double precision
    % (assumes the network returns single, its documented default).
    xpred(:, k+1) = xpred(:, k) + double(dxpred);
end
%% - plotting
% Overlay the reference trajectory (solid) and the network's closed-loop
% prediction (dashed) for both state components.
figure(1);
plot(t, x);
hold on;
grid on;
% Restart the colour cycle so each dashed prediction is drawn in the same
% colour as the solid reference curve of the same state component.
ax = gca;
ax.ColorOrderIndex = 1;
plot(t, xpred, '--');
xlabel('t');
ylabel('state');
legend('x_1 (ode45)', 'x_2 (ode45)', 'x_1 (LSTM)', 'x_2 (LSTM)', ...
    'Location', 'best');