gusucode.com > datatypes 工具箱matlab源码程序 > datatypes/@table/readTextFile.m

    function t = readTextFile(file, args)
%READTEXTFILE Read in a delimited text file and create a table.
%   T = READTEXTFILE(FILE,ARGS) opens FILE (retrying with a '.txt' extension
%   when FILE has none and cannot be opened), reads its contents with
%   TEXTSCAN, and returns the data as a table T.
%
%   ARGS is a cell array of parameter name/value pairs. The parameters
%   handled here are ReadVariableNames, ReadRowNames, Delimiter, Format,
%   Whitespace, TreatAsEmpty, HeaderLines, FileEncoding, DateLocale,
%   EndOfLine, CommentStyle, TextType and DatetimeType. Any other pairs are
%   validated against TEXTSCAN and forwarded to it.
%
%   When Delimiter, HeaderLines or Format is not supplied, the missing
%   settings are detected with detectImportOptions. When Format is not
%   supplied, the format is guessed from the first data line and refined (up
%   to an all-text format) if reading the whole file with it fails.
%
%   Errors raised while reading are rethrown through reportParamsWithError,
%   which may append the list of detected parameters to the message.

%   Copyright 2012-2015 The MathWorks, Inc.
import matlab.internal.tableUtils.validateLogical

% Default to mixed line endings -- any line may end with \r, \n, or \r\n.
mixedEOL = sprintf('\r\n');
defaultwhitespace = sprintf(' \b\t');
nvPairs = { 'ReadVariableNames', true;
    'ReadRowNames',false;
    'Delimiter',',';
    'Format',[];
    'Whitespace',defaultwhitespace;
    'TreatAsEmpty', {};
    'HeaderLines', 0;
    'FileEncoding','';
    'DateLocale','';
    'EndOfLine',mixedEOL;
    'CommentStyle',{}
    'TextType','char';
    'DatetimeType','datetime'};

    [readVarNames,readRowNames,delimiter,format,whitespace,treatAsEmpty,headerlines, ...
        fileEncoding,locale,eol,commentstyle,texttype,datetimetype,supplied,otherArgs] = matlab.internal.table.parseArgs(nvPairs(:,1)', nvPairs(:,2)', args{:});

% Remember which settings were detected rather than supplied by the caller;
% this is what gets reported alongside any error. The SUPPLIED flags
% themselves are overwritten further down once detection has run.
detected.Delimiter = ~supplied.Delimiter;
detected.HeaderLines = ~supplied.HeaderLines;
detected.Format = ~supplied.Format;
detected.ReadVariableNames = ~supplied.ReadVariableNames;

% Convert escape sequences such as '\n' or '\t' into the actual characters.
eol = sprintf(eol);
whitespace = sprintf(whitespace);

% validatestring accepts case-insensitive and abbreviated input and returns
% the canonical option. Keep that return value: the code below compares
% datetimetype against 'datetime' with strcmp, and texttype is forwarded to
% textscan, so both must be in canonical form.
texttype = validatestring(texttype,{'char','string'});
datetimetype = validatestring(datetimetype,{'datetime','text'});

otherArgs = [otherArgs , {'TextType',texttype}];
readRowNames = validateLogical(readRowNames,'ReadRowNames');
readVarNames = validateLogical(readVarNames,'ReadVariableNames');

inferFormat = ~supplied.Format;

% Pass the locale along to textscan
if supplied.Format && supplied.DateLocale
    otherArgs{end+1} = 'DateLocale';
    otherArgs{end+1} = locale;
end

if supplied.CommentStyle
    otherArgs{end+1} = 'CommentStyle';
    otherArgs{end+1} = commentstyle;
end

if isempty(treatAsEmpty)
    treatAsEmpty = {};
elseif ischar(treatAsEmpty) && ~isrow(treatAsEmpty)
    % textscan does something a little obscure when treatAsEmpty is char but
    % not a row vector, disallow that here.
    error(message('MATLAB:readtable:InvalidTreatAsEmpty'));
elseif ischar(treatAsEmpty) || iscellstr(treatAsEmpty)
    % TreatAsEmpty is only ever applied to numeric fields in the file, and
    % textscan ignores leading/trailing whitespace for those fields, so trim
    % insignificant whitespace.
    treatAsEmpty = strtrim(treatAsEmpty);
    if any(~isnan(str2double(treatAsEmpty))) || any(strcmpi('nan',treatAsEmpty))
        error(message('MATLAB:readtable:NumericTreatAsEmpty'));
    end
else
    error(message('MATLAB:readtable:InvalidTreatAsEmpty'));
end

% HeaderLines must be a non-negative integer-valued numeric scalar.
if ~isscalar(headerlines) || ~isnumeric(headerlines) || ...
        (headerlines < 0) || (round(headerlines) ~= headerlines)
    error(message('MATLAB:readtable:InvalidHeaderLines'));
end

if supplied.Delimiter
    % Check the delimiter, and convert 'space','comma', etc. into real delimiters.
    delimiter = validateDelims(delimiter);
end

% Check the parameters for valid values in textscan.
validateTextscanParameters(otherArgs);

% Open the file.
[fid,file] = openFile(file,fileEncoding);
try
    if ~supplied.Delimiter || ~supplied.HeaderLines || inferFormat
        % At least one of delimiter, header line count or format is unknown:
        % let detectImportOptions work out the missing pieces, constrained by
        % whatever the caller did supply.
        detectIOargs = {'Encoding',fileEncoding,...
            'Whitespace',whitespace,...
            'LineEnding',eol,...
            'CommentStyle',commentstyle};
        
        if ~inferFormat
            % A format was supplied: tell detection how many variables it
            % defines (literal text in the format is not counted).
            fmt_str = matlab.iofun.internal.formatParser(format);
            n = nnz(~fmt_str.IsLiteral);
            detectIOargs(end+1:end+2) = {'NumVariables',n};
        end
        
        if supplied.Delimiter
            detectIOargs = [detectIOargs {'Delimiter',delimiter}];
        end
        if supplied.HeaderLines
            detectIOargs = [detectIOargs {'NumHeaderLines',headerlines}];
        end
        opts = detectImportOptions(file,'FileType','text',detectIOargs{:});
        
        delimiter = cellfun(@sprintf,opts.Delimiter,'UniformOutput',false);
        if opts.VariableNamesLine > 0
            if ~supplied.HeaderLines
                headerlines = opts.VariableNamesLine - 1;
                supplied.HeaderLines = true;
            end
            if ~supplied.ReadVariableNames
                readVarNames =  true;
                supplied.ReadVariableNames = true;
            end
        else
            if ~supplied.HeaderLines
                headerlines = opts.DataLine - 1;
                supplied.HeaderLines = true;
            end
            if ~supplied.ReadVariableNames
                readVarNames =  false;
                supplied.ReadVariableNames = true;
            end
        end
        
        % If using space aligned reading, pass in mdao true (unless it's being
        % overridden by a user input; This is why it's added to the front.)
        if strcmp(opts.ConsecutiveDelimitersRule,'join')
            otherArgs = [{'MultipleDelimsAsOne', true}, otherArgs];
        end
    else
        % Everything needed was supplied: build the import options directly.
        opts = matlab.io.text.DelimitedTextImportOptions('Delimiter',delimiter,...
            'Whitespace',whitespace,...
            'LineEnding',eol,...
            'DataLine',headerlines+readVarNames+1,...
            'VariableNamesLine',readVarNames*(headerlines+readVarNames)...
            );
        
        % The bracketed expression is empty (hence false) when
        % MultipleDelimsAsOne was not passed, otherwise it is the passed value.
        if [otherArgs{find(strcmp(otherArgs,'MultipleDelimsAsOne'))+1}] %#ok<BDSCA>
            opts.ConsecutiveDelimitersRule = 'join';
            opts.LeadingDelimitersRule = 'ignore';
        end
    end
    
    if ~supplied.Whitespace
        whitespace = sprintf(opts.Whitespace);
        if supplied.Delimiter
            % A single-character delimiter must not also count as whitespace.
            if ischar(delimiter) && isscalar(delimiter)
                whitespace(whitespace==delimiter)=[];
            else
                for d = delimiter
                    if isscalar(d{1})
                        whitespace(whitespace==d{1})=[];
                    end
                end
            end
        end
        % If whitespace was not specified, but a non-standard EOL was, add the
        % standard EOL chars to the whitespace.
        % NOTE(review): eol has already been passed through sprintf above, so
        % it holds real CR/LF characters, while the literals compared here are
        % the escaped two- and four-character strings. The comparison therefore
        % looks like it cannot match a standard EOL, and mixedEOL is appended
        % whenever EndOfLine is supplied. Left as is to preserve behavior --
        % confirm the intent before changing.
        if supplied.EndOfLine && ~any(strcmp(eol,{'\r' '\n' '\r\n'}))
            whitespace = [whitespace mixedEOL];
        end
    end
    
    % Skip the header
    [~,endHeaderPos] = textscanReadLines(fid, 0, whitespace, eol, headerlines, otherArgs);
    if readVarNames
        vnline = getVariableNamesLine(inferFormat, fid, whitespace, eol, otherArgs);
        if isa(vnline,'double') && isempty(vnline) % file was empty, return empty table
            t = table.empty(0,0);
            fclose(fid);
            return
        end
    else
        vnline = '';
    end
    % Guess at a format string for the data.
    if inferFormat
        % Save current position (start of data) in file, make a guess at the
        % file format based on first line of data, step back to the start of the
        % data, and read in the data using the guessed-at format.
        startDataPosn = ftell(fid);
        format = guessFormat(fid, vnline, delimiter, whitespace, eol, treatAsEmpty, datetimetype, strcmp(opts.VariableTypes,'datetime'),otherArgs);
        fseek(fid, startDataPosn, 'bof');
        
    end
    
    numNonDataLines = headerlines + readVarNames;
    
    if inferFormat % read data using the detected format and update as it fails.
        rawData = textscanReadData(fid, format, delimiter, whitespace, eol, treatAsEmpty,otherArgs);
        
        if ~feof(fid)
            % textscan failed because some column had the "wrong" value in it,
            % i.e., a field in the first data line was numeric (or an empty
            % field), but a subsequent row had a non-numeric string in that
            % column.
            
            % Step back to the start of the data, make a more careful guess at
            % the format, step back again, and reread with the new format.
            fseek(fid, startDataPosn, 'bof');
            format = updateFormatGuess(fid, format, delimiter, whitespace, eol, treatAsEmpty, numNonDataLines, otherArgs);
            
            if ~supplied.ReadVariableNames
                % May have detected no variable names, but then the format
                % changed to all strings. In that case, read variable names
                if matlab.io.internal.text.fomatIsAllString(format)
                    readVarNames = true;
                    fseek(fid,endHeaderPos,'bof');
                    vnline = getVariableNamesLine(inferFormat, fid, whitespace, eol, otherArgs);
                    startDataPosn = ftell(fid);
                else
                    % Otherwise, recheck the vnline against the new format.
                    readVarNames = matlab.io.internal.text.detectVariableNames(format,vnline,delimiter,whitespace,eol,otherArgs);
                    if ~readVarNames
                        fseek(fid,endHeaderPos,'bof');
                        startDataPosn = endHeaderPos;
                    end
                end
                
            end
            fseek(fid, startDataPosn, 'bof');
            rawData = textscanReadData(fid, format, delimiter, whitespace, eol, treatAsEmpty,otherArgs);
            
            if ~feof(fid)
                % Even the more careful format guess did not succeed. Reread the
                % file treating all variables as strings.
                format(2:2:end) = 'q';
                fseek(fid,startDataPosn,'bof');
                % If varnames was detected as false, and not supplied by the
                % user, reset readVarNames at this point and skip the first data
                % line as it will be read as variable names. All string output
                % always reads varnames unless told otherwise.
                if ~supplied.ReadVariableNames && ~readVarNames
                    readVarNames = true;
                    fseek(fid,endHeaderPos,'bof');
                    vnline = getVariableNamesLine(inferFormat, fid, whitespace, eol, otherArgs);
                end
                rawData = textscanReadData(fid, format, delimiter, whitespace, eol, treatAsEmpty,otherArgs);
            end
        end
        
        
    else % ~inferFormat
        % Read in the data using the specified format.
        rawData = textscanReadData(fid, format, delimiter, whitespace, eol, treatAsEmpty, otherArgs);
        
        if ~feof(fid)
            m = message('MATLAB:readtable:CouldNotReadEntireFileWithFormat');
            baseME = MException(m.Identifier,'%s',getString(m));
            % If all the cells in rawData are the same length, textscan stopped
            % at the start of a line. If the first few cells have length L, and
            % the rest L-1, then textscan stopped mid-line. We can be helpful
            % here.
            varlens = cellfun(@(x)size(x,1),rawData);
            dvarlens = diff(varlens);
            locs = find(dvarlens);
            if isempty(locs) || (isscalar(locs) && dvarlens(locs)==-1)
                errLine = min(varlens) + 1 + numNonDataLines;
                m = message('MATLAB:readtable:ReadErrorOnLine',errLine);
                throw(addCause(baseME,MException(m.Identifier,'%s',getString(m))));
            else
                % Otherwise, something else happened for which we have no specific advice.
                throw(baseME);
            end
        end
    end
catch ME
    % Release the file before rethrowing; reportParamsWithError always throws.
    fclose(fid);
    reportParamsWithError(ME, detected, delimiter, headerlines, readVarNames, format);
end

% Record whether the whole file was consumed before closing it; this decides
% below whether short columns may be padded or must be reported as an error.
isEndOfFile = feof(fid);
fclose(fid);

try
    if readVarNames
        % Determine the table's variable names from the variable names line in
        % the file.
        varNames = matlab.io.internal.text.determineVarNames(vnline,format,delimiter,whitespace,eol,true,otherArgs);
        
        if numel(varNames) ~= length(rawData)
            [id,collectOutput] = findParamValue('CollectOutput',otherArgs);
            if ~isempty(id) && collectOutput % collect output is true.
                % This concatenates the variable names to match the output sizes
                % when CollectOutput is true
                sizes = cellfun(@(c)size(c,2),rawData);
                startOffset = cumsum([0 sizes(1:end-1)]);
                numNewVars = numel(sizes);
                newVarNames = cell(1,numNewVars);
                for i = 1:numNewVars
                    oldNamesIdx = startOffset(i) + (1:sizes(i));
                    newVarNames{i} = strjoin(varNames(oldNamesIdx),'_');
                end
                varNames = newVarNames;
            else
                % Retry with an EOL appended to the names line. Surplus names
                % are tolerated only when all of them are empty.
                varNames = matlab.io.internal.text.determineVarNames([vnline eol],format,delimiter,whitespace,eol,true,otherArgs);
                if numel(varNames) > length(rawData) && all(cellfun(@isempty,varNames(length(rawData)+1:end)))
                    varNames(length(rawData)+1:end) = [];
                else
                    error(message('MATLAB:readtable:ReadVarNamesFailed',file,length(rawData),numel(varNames)));
                end
            end
        end
    else
        % If reading row names, number remaining columns beginning from 1, we'll
        % drop Var0 below.
        varNames = matlab.internal.table.dfltVarNames((1:length(rawData))-readRowNames);
    end
    
    if isempty(rawData) % i.e., if the file had no data
        t_data = cell(length(rawData));
    else
        columnLengths = cellfun(@(x)size(x,1),rawData);
        tooShort = columnLengths ~= columnLengths(1);
        if any(tooShort)
            % Some of the columns didn't read to completion. If the file ended,
            % then all the data was read and we can pad the output with empty
            % values.
            if isEndOfFile
                rawData = fillShortRows(rawData,tooShort,columnLengths(1),otherArgs);
            else % otherwise, issue an error.
                if inferFormat
                    error(message('MATLAB:readtable:UnequalVarLengthsFromFileWithFormat'));
                else
                    error(message('MATLAB:readtable:UnequalVarLengthsFromFileNoFormat'));
                end
            end
        end
        
        if readRowNames
            % The first column holds the row names; convert it to a cell array
            % of strings and remove it (and its name) from the data.
            rowNames = rawData{1};
            if ischar(rowNames)
                rowNames = cellstr(rowNames);
            elseif isnumeric(rowNames)
                rowNames = sprintfc('%.15g',rowNames);
            elseif ~iscellstr(rowNames)
                error(message('MATLAB:readtable:RowNamesVarNotString', class(rowNames)));
            end
            rawData(1) = [];
            dimNames = matlab.internal.table.dfltDimNames;
            if readVarNames, dimNames{1} = varNames{1}; end
            varNames(1) = [];
        end
        t_data = rawData(:)';
    end
catch ME
    reportParamsWithError(ME,detected,delimiter,headerlines,readVarNames,format);
end
t = table(t_data{:});

% Set the var names.  These will be modified to make them valid, and the
% original strings saved in the VariableDescriptions property.  Fix up
% duplicate or empty names.
t.varDim = t.varDim.setLabels(varNames,[],true,true,true);

if ~isempty(rawData) && readRowNames
    t.rowDim = t.rowDim.setLabels(rowNames,[],true,true); % Fix up duplicate or empty names
    t.metaDim = t.metaDim.setLabels(dimNames,[],true,true,true); % Fix up duplicate, empty, or invalid names
end
if readVarNames
    % Make sure var names and dim names don't conflict. That could happen if var
    % names read from the file are the same as the default dim names (when ReadRowNames
    % is false), or same as the first dim name read from the file (ReadRowNames true).
    t.metaDim = t.metaDim.checkAgainstVarLabels(t.varDim.labels,'silent');
end
end


%-------------------------------------------------------------------------------
function format = guessFormat(fid,vnline,delimiter,whitespace,eol,treatAsEmpty,datetimetype,dates,otherArgs)
% Build an initial format string from the first data line at the current
% file position. When there is no data line, fall back to an all-numeric
% format sized from the variable names line.

% Pull the first data line out of the file as one string.
firstLine = textscanReadLines(fid, 1, whitespace, eol, 0, otherArgs);
haveData = ~(isempty(firstLine{1}) || isempty(firstLine{1}{1}));

if ~haveData
    % No data: count the fields on the variable names line, if there is one.
    if isempty(vnline)
        numVars = 0;
    else
        numVars = countDelimiters(vnline,delimiter, otherArgs)+1;
    end
    if numVars > 0
        format = repmat('%f',1,numVars);
    else
        format = '%*s'; % textscan does not accept an empty format
    end
    return
end

% Derive the format from the contents of the first data line.
format = matlab.io.internal.text.determineFormatString(firstLine{1}{1}, ...
    delimiter, whitespace, eol, treatAsEmpty, otherArgs);

if strcmp(datetimetype,'datetime') && any(dates)
    % Swap in %D for the columns flagged in DATES, ignoring any flags beyond
    % the number of specifiers in the parsed format.
    parsed = matlab.iofun.internal.formatParser(format);
    specs = parsed.Format;
    numUsable = min(numel(dates),numel(specs));
    specs(dates(1:numUsable)) = {'%D'};
    format = [specs{:}];
end
end


%-------------------------------------------------------------------------------
function format = updateFormatGuess(fid,format,delimiter,whitespace,eol,treatAsEmpty,numNonDataLines,otherArgs)
% Refine a format string by checking it against every remaining line of the
% file, starting from the current file position.
%
% The incoming format is the starting point. Each line is parsed with a
% "skip" version of the format that consumes fields without producing
% output, which is a cheap way to find out whether the format fits the line.
% The whole file is checked in a single pass before the data is (re)read, so
% that no data is created only to be discarded after a failed guess.
%
% Because earlier lines are not re-examined after the format changes, the
% returned format can still fail on the real read. That only happens in
% perverse cases where switching a field from %f to %q makes textscan
% interpret an earlier line differently. For example, line 2 below satisfies
% the format guessed from line 1, but not the format updated after line 3:
%    1, 11, aa, 21, xx
%    2, 12  bb, 22, yy,
%    3, EE, cc, 23, zz

numVars = numel(format)/2;
probeFormat = repmat('%*f', 1, numVars);
probeFormat(3:3:end) = format(2:2:end);

% Lines are read in blocks of 100, counting the blocks as we go.
linesPerBlock = 100;
blocksRead = 0;
while ~feof(fid)
    
    % Fetch the next block of lines, one string per line.
    blocksRead = blocksRead + 1;
    block = textscanReadLines(fid, linesPerBlock, whitespace, eol, 0, otherArgs);
    block = cellstr(block{1});
    
    if isempty(block)
        break % reached end of file
    end
    
    % Number of file lines that precede the first line of this block.
    linesBefore = linesPerBlock*(blocksRead-1) + numNonDataLines;
    
    for k = 1:numel(block)
        thisLine = block{k};
        
        % The file must be rectangular: every line needs the same number of
        % delimiters.
        delimCount = countDelimiters(thisLine, delimiter, otherArgs);
        if delimCount ~= numVars-1
            errLine = k + linesBefore;
            error(message('MATLAB:readtable:BadFileFormat', errLine, errLine, delimCount, numVars-1));
        end
        
        % Parse the line without creating values to see whether the current
        % guess fits. 'EndOfLine' is not needed, but saves the effort of
        % trying to detect it.
        [~,stopPos] = textscan(thisLine, probeFormat, 1, ...
            'Delimiter',delimiter,...
            'Whitespace',whitespace,...
            'TreatAsEmpty',treatAsEmpty,...
            'EndOfLine',eol,...
            otherArgs{:});
        
        % A parse that stops short means the guess failed on this line:
        % update the format and the probe derived from it.
        if stopPos ~= length(thisLine)
            format = matlab.io.internal.text.determineFormatString(thisLine, ...
                delimiter, whitespace, eol, treatAsEmpty, otherArgs, format);
            probeFormat(3:3:end) = format(2:2:end);
        end
    end
end
end


%-------------------------------------------------------------------------------
function delimiter = validateDelims(delimiter)
% Validate a delimiter specification and return it as a cell array of
% delimiter strings with escape sequences expanded. Named aliases such as
% 'space' or 'comma' are translated to the real character first.

tabChar = sprintf('\t');

if ischar(delimiter)
    % Map an alias onto the character it stands for; anything else is kept
    % unchanged and left for textscan to judge.
    switch delimiter
        case {'tab', '\t', tabChar}
            resolved = tabChar;
        case {'space',' '}
            resolved = ' ';
        case {'comma', ','}
            resolved = ',';
        case {'semi', ';'}
            resolved = ';';
        case {'bar', '|'}
            resolved = '|';
        otherwise
            resolved = delimiter;
    end
    % One cell per character of the expanded string.
    delimiter = num2cell(sprintf(resolved));
end

try
    % Let textscan decide whether the delimiter is acceptable, then expand
    % escape sequences in each entry.
    textscan('a','%*s','Delimiter',delimiter);
    for idx = 1:numel(delimiter)
        expanded = sprintf(delimiter{idx});
        delimiter{idx} = expanded;
    end
catch
    error(message('MATLAB:readtable:InvalidDelimiter'));
end
end


%-------------------------------------------------------------------------------
function [lines,pos] = textscanReadLines(fid,N,whitespace,eol,headerlines,otherArgs)
% Read N non-blank lines from the current position in an open file, after
% skipping HEADERLINES lines.
%
% A line is everything up to an EOL character, and each one is returned as
% its own string. Leading or embedded blank lines (blank modulo white space)
% are skipped. Passing the EOL setting lets textscan treat unconsumed EOL
% characters as blank lines and spares it from detecting the EOL itself. The
% final EOL is not consumed, but since textscan skips leading blank lines,
% subsequent calls won't care.

% Format that matches any run of characters that are not EOL characters.
wholeLineFormat = ['%[^' eol ']'];
fixedOpts = {'whitespace',whitespace, 'headerlines',headerlines, 'EndOfLine',eol};

[lines,pos] = textscan(fid, wholeLineFormat, N, fixedOpts{:}, otherArgs{:});

end


%-------------------------------------------------------------------------------
function data = textscanReadData(fid,format,delimiter,whitespace,eol,treatAsEmpty,otherArgs)
% Read data from the current position in an open file using the given format.
%
% textscan skips blank lines on its own (blank modulo white space), and
% still returns correctly typed outputs when nothing is left in the file.
fixedOpts = {'Delimiter',delimiter, 'whitespace',whitespace, ...
    'TreatAsEmpty',treatAsEmpty, 'EndOfLine',eol};
data = textscan(fid, format, fixedOpts{:}, otherArgs{:});
end


%-------------------------------------------------------------------------------
function vnline = getVariableNamesLine(inferFormat,fid,whitespace,eol,otherArgs)
% Read the next non-blank line at the current file position as the variable
% names line and return it as a single string. Blank lines in front of it
% are skipped.
%
% When no line could be read, the result is [] if the format is being
% inferred (the file is treated as empty), and a single blank otherwise.
scanned = textscanReadLines(fid, 1, whitespace, eol, 0, otherArgs);
gotLine = ~isempty(scanned{1}) && ~isempty(scanned{1}{1});

if gotLine
    vnline = scanned{1}{1};
elseif inferFormat
    vnline = []; % empty file
else
    vnline = ' '; % whitespace
end
end


%-------------------------------------------------------------------------------
function [fid,file] = openFile(file,fileEncoding)
% Open FILE for reading in text mode with the given encoding. If that fails
% and FILE has no extension, retry with '.txt' appended. FILE is returned as
% the name that was last tried; an error is raised if nothing could be opened.

% text mode: CRLF -> LF on windows (no-op on linux)
fid = fopen(file,'rt','n',fileEncoding);
if fid ~= -1
    return
end

% Try again with default extension if there wasn't one
[~,~,extension] = fileparts(file);
if isempty(extension)
    file = [file '.txt'];
    fid = fopen(file,'rt','n',fileEncoding);
end

if fid == -1
    error(message('MATLAB:readtable:OpenFailed',file));
end
end


%-------------------------------------------------------------------------------
function validateTextscanParameters(otherArgs)
% Check the pass-through name/value pairs in OTHERARGS before they are
% handed to textscan.
%
% Errors with MATLAB:table:parseArgs:BadParamName when ReturnOnError is
% present or when textscan does not recognise one of the parameter names.
% Any other failure reported by textscan (for example an invalid value) is
% rethrown unchanged.

% ReturnOnError doesn't make sense with READTABLE since it already errors when
% the file fails to parse. The name is matched ignoring case, in line with how
% textscan option names are used elsewhere in this file, and only the first
% occurrence is reported so the message receives exactly one name.
paramNames = otherArgs(1:2:end);
id = find(strcmpi(paramNames,'ReturnOnError'),1);
if ~isempty(id)
    error(message('MATLAB:table:parseArgs:BadParamName',otherArgs{2*id-1}));
end

if ~isempty(otherArgs)
    try
        % Give error based on whether we have a textscan param, or something
        % completely unknown.
        textscan('a','%*s',otherArgs{:});
    catch ME
        if strcmp(ME.identifier,'MATLAB:textscan:UnknownOption')
            % Work out which parameter textscan rejected by trying each
            % name/value pair separately, so the message names the actual
            % culprit rather than always the first parameter. Fall back to
            % the first name if no single pair reproduces the failure.
            badName = otherArgs{1};
            for k = 1:2:numel(otherArgs)-1
                try
                    textscan('a','%*s',otherArgs{k},otherArgs{k+1});
                catch pairME
                    if strcmp(pairME.identifier,'MATLAB:textscan:UnknownOption')
                        badName = otherArgs{k};
                        break
                    end
                end
            end
            error(message('MATLAB:table:parseArgs:BadParamName',badName));
        end
        % textscan may fail to validate a parameter, throw the appropriate
        % message
        throw(ME);
    end
end
end


%-------------------------------------------------------------------------------
function data = fillShortRows(data,tooShort,len,otherArgs)
% Pad the columns flagged in TOOSHORT. Numeric and cell columns gain one
% row; any other type is handed to lengthenVar with the target length LEN.
% OTHERARGS is consulted in case EmptyValue is set.
[whereFound,numericPad] = findParamValue('EmptyValue',otherArgs);
if isempty(whereFound)
    numericPad = NaN;
end

% Pad each flagged column according to its type.
for c = find(tooShort)
    column = data{c};
    if isnumeric(column)
        column(end+1,:) = numericPad; %#ok<AGROW>
    elseif iscell(column)
        column(end+1,:) = {''}; %#ok<AGROW>
    else
        column = matlab.internal.table.lengthenVar(column,len);
    end
    data{c} = column;
end
end


%-------------------------------------------------------------------------------
function numDelimiter = countDelimiters(row, delimiter, otherArgs)
% Count the delimiters in ROW by splitting it into %q fields with textscan
% and subtracting one from the number of fields. A NUL character is appended
% to ROW before it is parsed.
terminatedRow = [row char(0)];
parsed = textscan(terminatedRow, '%q', 'Delimiter', delimiter, otherArgs{:},'CollectOutput',false);
fieldCount = numel(parsed{1});
numDelimiter = fieldCount - 1;
end

function [id,value] = findParamValue(param,args)
% Look up the parameter PARAM in the name/value list ARGS.
%   ID is the index in ARGS of the last entry matching PARAM, or empty when
%   there is none. VALUE is the entry that follows it, or [] when PARAM is
%   not present.
%
% The match ignores case: the lists searched here are forwarded to textscan,
% and this file itself passes textscan option names in differing case (for
% example 'whitespace' and 'Whitespace'), so a caller-supplied
% 'collectoutput' must be found just like 'CollectOutput'.
id = find(strcmpi(args,param),1,'last');
if ~isempty(id)
    value = args{id+1};
else
    value = [];
end
end


%-------------------------------------------------------------------------------
function reportParamsWithError(ME,detected,delimiter,headerlines,readVarNames,format)
% Rethrow ME. When reportParamsList returns a non-empty parameter list, a
% new exception is thrown instead, with the same identifier and with the
% parameter list header and the list appended to the original message.
paramText = matlab.io.text.internal.reportParamsList(detected,delimiter,headerlines,readVarNames,format);
if isempty(paramText)
    throw(ME);
end

originalText = matlab.io.internal.utility.unescape(ME.message);
listHeader = getString(message('MATLAB:readtable:ParameterListHeader'));
throw(MException(ME.identifier,[originalText '\n\n' listHeader paramText]));
end