read unstructured data with readtable function

6 views (last 30 days)
I want to read the data in the attached file, using readtbale given below:
fname1='sample.txt';
opt = detectImportOptions(fname1, 'headerlines', 0);
data = readtable(fname1, opt);
However, the result is not correct because there is a shift of data structure. I plan to get following structure:
Is there any best way to do it? thank you for help

Accepted Answer

Kevin Holly
Kevin Holly on 21 Sep 2021
Edited: Kevin Holly on 21 Sep 2021
You can simply use the "Import Data" button on the toolstrip in MATLAB. A user interface will show up and before importing the data as a table, you can click a dropdown box by the "Import Selection" button and select "Generate Script" or "Generate Function" to generate code.
Below is code generated by MATLAB:
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%8s%3s%5s%8s%5s%5s%5s%5s%s%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this code. If an error occurs for a different file, try regenerating the code from the Import Tool.
dataArray = textscan(fileID, formatSpec, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'ReturnOnError', false);
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[3,4,5,6,7,8,9]
% Converts text in the input cell array to numbers. Replaced non-numeric text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^[-/+]*\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
try
% Handle dates surrounded by quotes
dataArray{1} = cellfun(@(x) x(2:end-1), dataArray{1}, 'UniformOutput', false);
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
dates{1} = repmat(datetime([NaN NaN NaN]), size(dataArray{1}));
end
end
dates = dates(:,1);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [3,4,5,6,7,8,9]);
rawStringColumns = string(raw(:, 2));
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
idx = (rawStringColumns(:, 1) == "<undefined>");
rawStringColumns(idx, 1) = "";
%% Create output variable
test = table;
test.VarName1 = dates{:, 1};
test.VarName2 = categorical(rawStringColumns(:, 1));
test.VarName3 = cell2mat(rawNumericColumns(:, 1));
test.VarName4 = cell2mat(rawNumericColumns(:, 2));
test.VarName5 = cell2mat(rawNumericColumns(:, 3));
test.VarName6 = cell2mat(rawNumericColumns(:, 4));
test.VarName7 = cell2mat(rawNumericColumns(:, 5));
test.VarName8 = cell2mat(rawNumericColumns(:, 6));
test.VarName9 = cell2mat(rawNumericColumns(:, 7));
% For code requiring serial dates (datenum) instead of datetime, uncomment the following line(s) below to return the imported dates as datenum(s).
% test.VarName1=datenum(test.VarName1);
%% Clear temporary variables
clearvars filename formatSpec fileID dataArray ans raw col numericData rawData row regexstr result numbers invalidThousandsSeparator thousandsRegExp dates blankDates anyBlankDates invalidDates anyInvalidDates rawNumericColumns rawStringColumns idx;
This is what "Generate Function" creates:
function test1 = importfile(filename, startRow, endRow)
%IMPORTFILE Import numeric data from a text file as a matrix.
% TEST1 = IMPORTFILE(FILENAME)
% Reads data from text file FILENAME for the default selection.
%
% TEST1 = IMPORTFILE(FILENAME, STARTROW, ENDROW)
% Reads data from rows STARTROW through ENDROW of text file FILENAME.
%
% Example:
% test1 = importfile('test.txt', 1, 16);
%
% See also TEXTSCAN.
% Auto-generated by MATLAB on 2021/09/21 10:57:33
%% Initialize variables.
if nargin<=2
startRow = 1;
endRow = inf;
end
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%8s%3s%5s%8s%5s%5s%5s%5s%s%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this code. If an error occurs for a different file, try regenerating the code from the Import Tool.
dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(1)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for block=2:length(startRow)
frewind(fileID);
dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(block)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for col=1:length(dataArray)
dataArray{col} = [dataArray{col};dataArrayBlock{col}];
end
end
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[3,4,5,6,7,8,9]
% Converts text in the input cell array to numbers. Replaced non-numeric text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^[-/+]*\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
try
% Handle dates surrounded by quotes
dataArray{1} = cellfun(@(x) x(2:end-1), dataArray{1}, 'UniformOutput', false);
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
dates{1} = repmat(datetime([NaN NaN NaN]), size(dataArray{1}));
end
end
dates = dates(:,1);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [3,4,5,6,7,8,9]);
rawStringColumns = string(raw(:, 2));
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
idx = (rawStringColumns(:, 1) == "<undefined>");
rawStringColumns(idx, 1) = "";
%% Create output variable
test1 = table;
test1.VarName1 = dates{:, 1};
test1.VarName2 = categorical(rawStringColumns(:, 1));
test1.VarName3 = cell2mat(rawNumericColumns(:, 1));
test1.VarName4 = cell2mat(rawNumericColumns(:, 2));
test1.VarName5 = cell2mat(rawNumericColumns(:, 3));
test1.VarName6 = cell2mat(rawNumericColumns(:, 4));
test1.VarName7 = cell2mat(rawNumericColumns(:, 5));
test1.VarName8 = cell2mat(rawNumericColumns(:, 6));
test1.VarName9 = cell2mat(rawNumericColumns(:, 7));
% For code requiring serial dates (datenum) instead of datetime, uncomment the following line(s) below to return the imported dates as datenum(s).
% test1.VarName1=datenum(test1.VarName1);
  3 Comments
Kevin Holly
Kevin Holly on 21 Sep 2021
Edited: Kevin Holly on 21 Sep 2021
Maruki,
I imported the data shown above with the previous data and generated the following functon:
function test3 = importfile(filename, startRow, endRow)
%IMPORTFILE Import numeric data from a text file as a matrix.
% TEST3 = IMPORTFILE(FILENAME)
% Reads data from text file FILENAME for the default selection.
%
% TEST3 = IMPORTFILE(FILENAME, STARTROW, ENDROW)
% Reads data from rows STARTROW through ENDROW of text file FILENAME.
%
% Example:
% test3 = importfile('test3.txt', 1, 27);
%
% See also TEXTSCAN.
% Auto-generated by MATLAB on 2021/09/21 11:44:28
%% Initialize variables.
if nargin<=2
startRow = 1;
endRow = inf;
end
%% Read columns of data as text:
% For more information, see the TEXTSCAN documentation.
formatSpec = '%8s%3s%5s%8s%5s%5s%5s%5s%s%[^\n\r]';
%% Open the text file.
fileID = fopen(filename,'r');
%% Read columns of data according to the format.
% This call is based on the structure of the file used to generate this code. If an error occurs for a different file, try regenerating the code from the Import Tool.
dataArray = textscan(fileID, formatSpec, endRow(1)-startRow(1)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(1)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for block=2:length(startRow)
frewind(fileID);
dataArrayBlock = textscan(fileID, formatSpec, endRow(block)-startRow(block)+1, 'Delimiter', '', 'WhiteSpace', '', 'TextType', 'string', 'HeaderLines', startRow(block)-1, 'ReturnOnError', false, 'EndOfLine', '\r\n');
for col=1:length(dataArray)
dataArray{col} = [dataArray{col};dataArrayBlock{col}];
end
end
%% Close the text file.
fclose(fileID);
%% Convert the contents of columns containing numeric text to numbers.
% Replace non-numeric text with NaN.
raw = repmat({''},length(dataArray{1}),length(dataArray)-1);
for col=1:length(dataArray)-1
raw(1:length(dataArray{col}),col) = mat2cell(dataArray{col}, ones(length(dataArray{col}), 1));
end
numericData = NaN(size(dataArray{1},1),size(dataArray,2));
for col=[3,4,5,6,7,8,9]
% Converts text in the input cell array to numbers. Replaced non-numeric text with NaN.
rawData = dataArray{col};
for row=1:size(rawData, 1)
% Create a regular expression to detect and remove non-numeric prefixes and suffixes.
regexstr = '(?<prefix>.*?)(?<numbers>([-]*(\d+[\,]*)+[\.]{0,1}\d*[eEdD]{0,1}[-+]*\d*[i]{0,1})|([-]*(\d+[\,]*)*[\.]{1,1}\d+[eEdD]{0,1}[-+]*\d*[i]{0,1}))(?<suffix>.*)';
try
result = regexp(rawData(row), regexstr, 'names');
numbers = result.numbers;
% Detected commas in non-thousand locations.
invalidThousandsSeparator = false;
if numbers.contains(',')
thousandsRegExp = '^[-/+]*\d+?(\,\d{3})*\.{0,1}\d*$';
if isempty(regexp(numbers, thousandsRegExp, 'once'))
numbers = NaN;
invalidThousandsSeparator = true;
end
end
% Convert numeric text to numbers.
if ~invalidThousandsSeparator
numbers = textscan(char(strrep(numbers, ',', '')), '%f');
numericData(row, col) = numbers{1};
raw{row, col} = numbers{1};
end
catch
raw{row, col} = rawData{row};
end
end
end
% Convert the contents of columns with dates to MATLAB datetimes using the specified date format.
try
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
try
% Handle dates surrounded by quotes
dataArray{1} = cellfun(@(x) x(2:end-1), dataArray{1}, 'UniformOutput', false);
dates{1} = datetime(dataArray{1}, 'Format', 'HH:mm:ss', 'InputFormat', 'HH:mm:ss');
catch
dates{1} = repmat(datetime([NaN NaN NaN]), size(dataArray{1}));
end
end
dates = dates(:,1);
%% Split data into numeric and string columns.
rawNumericColumns = raw(:, [3,4,5,6,7,8,9]);
rawStringColumns = string(raw(:, 2));
%% Replace non-numeric cells with NaN
R = cellfun(@(x) ~isnumeric(x) && ~islogical(x),rawNumericColumns); % Find non-numeric cells
rawNumericColumns(R) = {NaN}; % Replace non-numeric cells
%% Make sure any text containing <undefined> is properly converted to an <undefined> categorical
idx = (rawStringColumns(:, 1) == "<undefined>");
rawStringColumns(idx, 1) = "";
%% Create output variable
test3 = table;
test3.VarName1 = dates{:, 1};
test3.VarName2 = categorical(rawStringColumns(:, 1));
test3.VarName3 = cell2mat(rawNumericColumns(:, 1));
test3.VarName4 = cell2mat(rawNumericColumns(:, 2));
test3.VarName5 = cell2mat(rawNumericColumns(:, 3));
test3.VarName6 = cell2mat(rawNumericColumns(:, 4));
test3.VarName7 = cell2mat(rawNumericColumns(:, 5));
test3.VarName8 = cell2mat(rawNumericColumns(:, 6));
test3.VarName9 = cell2mat(rawNumericColumns(:, 7));
% For code requiring serial dates (datenum) instead of datetime, uncomment the following line(s) below to return the imported dates as datenum(s).
% test3.VarName1=datenum(test3.VarName1);
This should place NaN where values are missing.

Sign in to comment.

More Answers (1)

Walter Roberson
Walter Roberson on 21 Sep 2021
The table is fixed width. Use fixed width import options; https://www.mathworks.com/help/matlab/ref/matlab.io.text.fixedwidthimportoptions.html

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!