% gusucode.com > bigdata toolbox MATLAB source code > bigdata/@tall/private/histcountsData.m

    function[counts, edges, bin] = histcountsData(tallX, varargin)
%histcountsData - histcounts on a tall array
%
%   [counts, edges, bin] = histcountsData(tallX, ...) parses the trailing
%   arguments with the local parseinput (an optional positional bin count
%   or edge vector followed by name-value pairs), obtains bin edges either
%   from the caller or from one of the bin rules, and counts the elements
%   of tallX per bin.
%
%   counts - per-bin counts, rescaled according to the 'Normalization'
%            option by normalizeCounts.
%   edges  - the caller-supplied edge vector (returned unmodified) when one
%            was given, otherwise the edges computed here.
%   bin    - bin index for each element of tallX as returned by discretize,
%            with 0 substituted wherever discretize produced NaN.

% Copyright 2016 The MathWorks, Inc.

% Re-throw parsing/validation errors so they appear to originate from the
% caller rather than from this private helper.
try
    opts = parseinput(varargin);
catch e
    throwAsCaller(e);
end

% Force tallX into a partition ordered column, used to compute summary
% statistics for bin methods that need it.
% Must copy the original adaptor to propagate the underlying class but
% the size (if known) must be reset after reshaping.
tX = chunkfun(@(x) x(:), tallX);
tX.Adaptor = resetSizeInformation(tallX.Adaptor);

if ~isempty(opts.BinLimits)
    % only count the values that fall within BinLimits
    withinBinLimits = tX>=opts.BinLimits(1) & tX<=opts.BinLimits(2);
    tX = filterslices(withinBinLimits, tX);
end

% Summary statistics (fields min/max/std/numel are read further down by the
% edge-rule helpers) are gathered from the column-shaped, limit-filtered data.
if ismember(opts.BinMethod, {'auto', 'scott', 'maxnumbins'})
    % We need double values in Scott's rule for edge computation
    xStats = matlab.bigdata.internal.util.getArrayStatistics(elementfun(@iCastToDouble, tX));
else
    xStats = matlab.bigdata.internal.util.getArrayStatistics(tX);
end

if isempty(opts.BinEdges)
    % Compute edges according to inputs
    % Precedence: explicit NumBins, then BinWidth, then the BinMethod rule.
    if ~isempty(opts.NumBins)
        [edgesFcn, edgesFcnArgs] = getNumBinsEdgeFun(opts.NumBins, xStats, opts.BinLimits);
    elseif ~isempty(opts.BinWidth)
        [edgesFcn, edgesFcnArgs] = getBinWidthEdgeFun(opts.BinWidth, xStats, opts.BinLimits);
    else
        % BinMethod code path
        [edgesFcn, edgesFcnArgs] = getBinMethodEdgeFun(tX, opts.BinMethod, xStats, opts.BinLimits, opts.MaxNumBins);
    end
    
    % The selected rule is invoked through clientfun with the prepared
    % arguments; its result is then passed through compressEdges, which
    % replaces uniformly spaced finite edges by {min, step, max}.
    edges = clientfun(edgesFcn, edgesFcnArgs{:});
    edges = clientfun(@compressEdges, edges);
else
    % use supplied edge vector
    edges = opts.BinEdges;
    edges = compressEdges(edges);
end

% Each call of partialHistcounts returns only the non-empty bins as
% (index, count) pairs; histcountsCombiner merges pairs that share an index,
% and reshapeHistcountsOutput expands the merged pairs into a full 1-by-numBins row.
[countIndices, counts] = aggregatefun(@partialHistcounts, @histcountsCombiner, tX, edges);
counts = clientfun(@reshapeHistcountsOutput, countIndices, counts, edges);

counts = normalizeCounts(counts, edges, opts.Normalization);

if nargout > 2
    % Compute the bin indices using the possibly compressed edges
    % Note this uses the original tallX (not the reshaped/filtered tX), so
    % bin is computed element-by-element on the input as given.
    bin = elementfun(@getBinIndices, tallX, matlab.bigdata.internal.broadcast(edges));    
end

if nargout > 1
    if isempty(opts.BinEdges)
        % Decompress the edges when they are requested as an output argument
        % but only *after* we've computed the bin indices (if also requested)
        edges = clientfun(@decompressEdges, edges);
    else
        % return the supplied local edges, unmodified.
        edges = opts.BinEdges;
    end
end

end

function x = iCastToDouble(x)
% Return X as a floating-point array: single/double inputs pass through
% untouched, every other class is converted to double.
if isfloat(x)
    return;
end
x = double(x);
end

% Implementation copied from toolbox/matlab/datafun/histcounts.m
function opts = parseinput(input)
% Parse the optional arguments of histcounts into an options struct.
%
%   input - cell array of the arguments that followed the data argument:
%           optionally a leading numeric/logical bin count (scalar) or edge
%           vector (non-scalar), then name-value pairs.
%
%   opts  - struct with fields NumBins, MaxNumBins, BinEdges, BinLimits,
%           BinWidth, Normalization and BinMethod. Fields that were not
%           specified keep the defaults set below.
%
% Errors are raised by validateattributes/validatestring (reported under the
% function name 'histcounts') or with the MATLAB:histcounts:* identifiers
% used below for odd-length name-value lists and conflicting bin options.

opts = struct('NumBins',[], 'MaxNumBins', getmaxnumbins(), 'BinEdges',[],...
    'BinLimits',[],'BinWidth',[],'Normalization','count','BinMethod','auto');

% Must report histcounts in any exception ids that propagate out of here
funcName = 'histcounts';

% Parse second input in the function call
if ~isempty(input)
    in = input{1};
    % inputoffset tracks whether a positional argument was consumed so that
    % argument positions quoted in error messages stay correct.
    inputoffset = 0;
    if isnumeric(in) || islogical(in)
        if isscalar(in)
            % Scalar => number of bins
            validateattributes(in,{'numeric','logical'},{'integer', 'positive'}, ...
                funcName, 'm', inputoffset+2)
            % NOTE(review): unlike the 'NumBins' name-value path, the value is
            % stored without a double() cast - confirm this is intended.
            opts.NumBins = in;
            opts.BinMethod = '';
        else
            % Non-scalar => explicit bin edges
            validateattributes(in,{'numeric','logical'},{'vector','nonempty', ...
                'real', 'nondecreasing'}, funcName, 'edges', inputoffset+2)
            opts.BinEdges = in;
            opts.BinMethod = '';
        end
        input(1) = [];
        inputoffset = 1;
    end
    
    % All the rest are name-value pairs
    inputlen = length(input);
    if rem(inputlen,2) ~= 0
        error(message('MATLAB:histcounts:ArgNameValueMismatch'))
    end
    
    for i = 1:2:inputlen
        name = validatestring(input{i}, {'NumBins', 'MaxNumBins', 'BinEdges', 'BinWidth', 'BinLimits', ...
            'Normalization', 'BinMethod'}, i+1+inputoffset);
        
        value = input{i+1};
        % Later options override earlier, mutually exclusive ones by clearing
        % their fields; combining anything with previously given BinEdges
        % (other than Normalization/MaxNumBins) is an error.
        switch name
            case 'NumBins'
                validateattributes(value,{'numeric','logical'},{'scalar', 'integer', ...
                    'positive'}, funcName, 'NumBins', i+2+inputoffset)
                opts.NumBins = double(value);
                if ~isempty(opts.BinEdges)
                    error(message('MATLAB:histcounts:InvalidMixedBinInputs'))
                end
                opts.BinMethod = '';
                opts.BinWidth = [];
            case 'MaxNumBins'
                % Report the option under its own name (previously the
                % validation message mislabelled it as 'NumBins').
                validateattributes(value,{'numeric','logical'},{'scalar', 'integer', ...
                    'positive', '<=', getmaxnumbins}, funcName, 'MaxNumBins', i+2+inputoffset)
                opts.MaxNumBins = double(value);
                opts.BinMethod = 'maxnumbins';
            case 'BinEdges'
                validateattributes(value,{'numeric','logical'},{'vector', ...
                    'real', 'nondecreasing'}, funcName, 'BinEdges', i+2+inputoffset);
                if length(value) < 2
                    error(message('MATLAB:histcounts:EmptyOrScalarBinEdges'));
                end
                opts.BinEdges = value;
                opts.BinMethod = '';
                opts.NumBins = [];
                opts.BinWidth = [];
                opts.BinLimits = [];
            case 'BinWidth'
                validateattributes(value, {'numeric','logical'}, {'scalar', 'real', ...
                    'positive', 'finite'}, funcName, 'BinWidth', i+2+inputoffset);
                opts.BinWidth = double(value);
                if ~isempty(opts.BinEdges)
                    error(message('MATLAB:histcounts:InvalidMixedBinInputs'))
                end
                opts.BinMethod = '';
                opts.NumBins = [];
            case 'BinLimits'
                validateattributes(value, {'numeric','logical'}, {'numel', 2, 'vector', 'real', ...
                    'nondecreasing', 'finite'}, funcName, 'BinLimits', i+2+inputoffset)
                opts.BinLimits = value;
                if ~isempty(opts.BinEdges)
                    error(message('MATLAB:histcounts:InvalidMixedBinInputs'))
                end
                
                if ~isfloat(opts.BinLimits)
                    % for integers, the edges are doubles
                    opts.BinLimits = double(opts.BinLimits);
                end
            case 'Normalization'
                opts.Normalization = validatestring(value, {'count', 'countdensity', 'cumcount',...
                    'probability', 'pdf', 'cdf'}, funcName, 'Normalization', i+2+inputoffset);
            otherwise % 'BinMethod'
                opts.BinMethod = validatestring(value, {'auto','scott', ...
                    'integers', 'sturges', 'sqrt'}, funcName, 'BinMethod', i+2+inputoffset);
                if ~isempty(opts.BinEdges)
                    error(message('MATLAB:histcounts:InvalidMixedBinInputs'))
                end
                opts.BinWidth = [];
                opts.NumBins = [];
        end
    end
end
end

% Implementation copied from toolbox/matlab/datafun/histcounts.m
function mb = getmaxnumbins
% Upper bound on the number of bins: 65536.
mb = 2^16;
end

function [edgesFcn, edgesFcnArgs] = getNumBinsEdgeFun(N, xStats, limits)
% Select the edge rule for a fixed number of bins.
%   edgesFcn     - handle to the numbinsrule bin method
%   edgesFcnArgs - cell array of arguments for that handle:
%                  {N, data minimum, data maximum, limits}
dataMin = xStats.min;
dataMax = xStats.max;
edgesFcnArgs = {N, dataMin, dataMax, limits};
edgesFcn = @matlab.bigdata.internal.binmethods.numbinsrule;
end

function [edgesFcn, edgesFcnArgs] = getBinWidthEdgeFun(binWidth, xStats, limits)
% Select the edge rule for a fixed bin width.
%   edgesFcn     - handle to the binwidthrule bin method
%   edgesFcnArgs - cell array of arguments for that handle:
%                  {binWidth, data minimum, data maximum, limits, max bin count}
binCap = getmaxnumbins();
dataMin = xStats.min;
dataMax = xStats.max;
edgesFcnArgs = {binWidth, dataMin, dataMax, limits, binCap};
edgesFcn = @matlab.bigdata.internal.binmethods.binwidthrule;
end

function [edgesFcn, edgesFcnArgs] = getBinMethodEdgeFun(tX, binMethod, xStats, limits, maxNumBins)
% Map a BinMethod name onto the bin-rule function and its argument list.
%   tX         - data, only consulted for the 'auto' and 'maxnumbins' rules
%   binMethod  - one of 'auto', 'scott', 'integers', 'sqrt', 'sturges',
%                'maxnumbins'; any other value leaves the outputs unassigned
%   xStats     - statistics with fields min, max, std and numel
%   limits     - BinLimits (possibly empty)
%   maxNumBins - cap on the number of bins for the rules that accept one
import matlab.bigdata.internal.binmethods.autorule
import matlab.bigdata.internal.binmethods.scottsrule
import matlab.bigdata.internal.binmethods.integersrule
import matlab.bigdata.internal.binmethods.sturgesrule
import matlab.bigdata.internal.binmethods.sqrtrule
import matlab.bigdata.internal.binmethods.maxnumbinsrule

switch binMethod
    case {'auto', 'maxnumbins'}
        % Both rules share one argument list and need the integer-rule check.
        if strcmp(binMethod, 'auto')
            edgesFcn = @autorule;
        else
            edgesFcn = @maxnumbinsrule;
        end
        useIntegerRule = lazyCheckAutoRulePreferIntegersRule(tX);
        edgesFcnArgs = {useIntegerRule, xStats.min, ...
            xStats.max, xStats.std, xStats.numel, limits, maxNumBins};
    case {'sqrt', 'sturges'}
        % Both rules depend only on the element count and the data range.
        if strcmp(binMethod, 'sqrt')
            edgesFcn = @sqrtrule;
        else
            edgesFcn = @sturgesrule;
        end
        edgesFcnArgs = {xStats.numel, xStats.min, xStats.max, limits};
    case 'integers'
        edgesFcn = @integersrule;
        edgesFcnArgs = {xStats.min, xStats.max, limits, maxNumBins};
    case 'scott'
        edgesFcn = @scottsrule;
        edgesFcnArgs = {xStats.std, xStats.numel, xStats.min, xStats.max, limits};
end
end

function preferIntRule = lazyCheckAutoRulePreferIntegersRule(tX)
% Decide whether the integers rule should be preferred for tX.
% The result is true when either
%   * the underlying class of tX is something other than single or double, or
%   * every chunk of tX is equal to its rounded self.
% Both conditions are built with clientfun/aggregatefun and OR-ed together
% in a final clientfun call.

isNonFloatClass = clientfun( ...
    @(clsName) ~ismember(clsName, {'single', 'double'}), classUnderlying(tX));
allWholeNumbers = aggregatefun(@(chunk) isequal(chunk, round(chunk)), @all, tX);
preferIntRule = clientfun(@(nonFloat, whole) (nonFloat||whole), ...
    isNonFloatClass, allWholeNumbers);
end

function [countIndices, counts] = partialHistcounts(x, edges)
% Histogram one block of data and keep only the bins that received data.
%   countIndices - column of bin numbers with a non-zero count
%   counts       - column of the matching counts
binEdges = decompressEdges(edges);
perBin = histcounts(x, binEdges).';
countIndices = find(perBin);
counts = perBin(countIndices);
end

function [countIndices, counts] = histcountsCombiner(countIndices, counts)
% Merge stacked (bin index, count) pairs: counts that share a bin index are
% summed, and the bin indices are returned sorted and unique.
[distinctBins, ~, groupOfEntry] = unique(countIndices(:));
counts = accumarray(groupOfEntry, counts(:));
countIndices = distinctBins;
end

function allCounts = reshapeHistcountsOutput(countIndices, counts, edges)
% Expand sparse (bin index, count) pairs into a full row of counts with one
% entry per bin; bins that are not listed stay at zero.
fullEdges = decompressEdges(edges);
allCounts = zeros(1, numel(fullEdges) - 1);
allCounts(countIndices) = counts;
end

function edges = compressEdges(edges)
% Replace an evenly spaced, finite edge vector by the colon arguments
% {first, step, last} that regenerate it; any other input is returned as is.
% This is done to avoid unnecessarily communicating a large row vector.
spacing = unique(diff(edges));

% Compressible only if every edge is finite and all gaps equal one positive step.
isUniform = all(isfinite(edges)) && isscalar(spacing) && spacing > 0;
if ~isUniform
    return;
end

edges = {min(edges), spacing, max(edges)};
end

function edges = decompressEdges(edges)
% Inverse of compressEdges: a cell array is treated as the argument list of
% colon and expanded back into a vector; anything else is returned unchanged.
if ~iscell(edges)
    return;
end
colonArgs = edges;
edges = colon(colonArgs{:});
end

function counts = normalizeCounts(counts, edges, normType)
% Rescale raw bin counts according to the requested normalization.
%   counts   - raw counts per bin
%   edges    - bin edges, possibly in the compressed {min, step, max} form
%   normType - 'count', 'countdensity', 'cumcount', 'probability', 'pdf'
%              or 'cdf'
% 'count' (matched case-insensitively) returns the input untouched; every
% other type is applied through clientfun.

if strcmpi(normType, 'count')
    % Default normalization: the raw counts are already the answer.
    return;
end

% Width of every bin as double, recovered from the (possibly compressed) edges.
binWidths = @(e) double(diff(decompressEdges(e)));

switch normType
    case 'cumcount'
        scaleFcn = @(n, ~) cumsum(n);
    case 'probability'
        scaleFcn = @(n, ~) n / sum(n);
    case 'cdf'
        scaleFcn = @(n, ~) cumsum(n / sum(n));
    case 'countdensity'
        scaleFcn = @(n, e) n./binWidths(e);
    case 'pdf'
        scaleFcn = @(n, e) n/sum(n)./binWidths(e);
end

% There is an assumption here that counts and edges will fit on client
% numel(edges) <= getmaxnumbins = 2^16 so this seems plausible.
counts = clientfun(scaleFcn, counts, edges);
end

function bins = getBinIndices(x, edges)
% Bin number of every element of x for the given (possibly compressed)
% edges. Elements for which discretize returns NaN are reported as bin 0.
bins = discretize(x, decompressEdges(edges));
unbinned = isnan(bins);
bins(unbinned) = 0;
end