Source: univariate.js

'use strict';

const { Binning } = require( './binning.js' );

const version = 'stats-logscale/univariate@1.0'; // semver w/o patch part

/**
 *  @classdesc Univariate statistical distribution analysis tool
 *  It works by sorting data into bins.
 *  This bin size depends on absolute & relative precision
 *  of the incoming data.
 *  Thus, very large samples can be processed fast
 *  with reasonable memory usage.
 */
class Univariate extends Binning {
    /**
     * @param {Object} args
     * @param {Number} [args.base]      Must be between 1 and 1.5. Default is 1.001
     * @param {Number} [args.precision] Must be positive. Default is 1E-9.
     * @param {Array}  [args.bins]      See addWeighted() for description
     */
    constructor (args = {}) {
        super(args);
        this.storage = {}; // str(bin) => [ count, num(bin) ]
        this._count = 0;
        this._cache = {};
        this.neat = new Neat(this);

        if (args.bins)
            this.addWeighted(args.bins);
    }

    /**
     * @desc Add value(s) to sample.
     * @param {...Number} data Number(s) to add to sample
     * @returns {Univariate} this (chainable)
     * @example
     * for (let i=0; i<10000; i++)
     *    stat.add(-Math.log(Math.random()));
     * // creates exponential distribution
     * @example
     * stat.add( 1,2,3,4,5,6 );
     * // a d6
     */
    add ( ...data ) {
        this._cache = {};
        data.forEach( x => {
            const bin = this.round(x);
            if ( !this.storage[bin] )
                this.storage[bin] = [0, bin];
            this.storage[bin][0]++;
            this._count++; // round() may throw, so increase counter one by one
        });
        return this;
    }

    /**
     * @desc Add values to sample, with weights.
     * @param {number[][]} pairs Each pair is an array with two numbers:
     * [ value, quantity ]. Negative quantity is allowed and means we're erasing data.
     * @returns {Univariate} this (chainable)
     * @example
     * stat.addWeighted( [ [0.1, 5], [0.2, 4], [0.3, 3] ] )
     * // adds 0.1 x 5, 0.2 x 4, 0.3 x 3
     */
    addWeighted ( pairs ) {
        this._cache = {};
        // TODO validate
        pairs.forEach( entry => {
            const x = entry[0];
            const n = Number.parseFloat( entry[1] ); // fractional weights possible
            if (Number.isNaN(n))
                throw new Error('Attempt to provide a non-numeric weight');

            const bin = this.round( x );
            if ( !this.storage[bin] )
                this.storage[bin] = [0, bin];
            this.storage[bin][0] += n;
            this._count += n;

            if (this.storage[bin][0] <= 0) {
                // Negative weight => we've "forgetting" data
                this._count -= this.storage[bin][0];
                delete this.storage[bin];
            }
        });
        return this;
    }

    /**
     *  @desc Serialization of the sample.
     *  @returns {Object} plain data structure that can serve
     *      as an argument to new().
     */
    toJSON () {
        return {
            version,
            precision: this.getPrecision(),
            base:      this.getBase(),
            bins:      this.getBins(),
        }
    }

    /**
     *   @desc create a copy of sample object, possibly modifying precision
     *   settings and/or filtering data.
     *   @param {Object} [args]
     *   @param {Number} [args.precision] Override absolute precision
     *   @param {Number} [args.base] Override relative precision
     *   @param {Number} [args.min] Filter values less than this
     *   @param {Number} [args.max] Filter values greater than this
     *   @param {Number} [args.ltrim] Filter values less than Xth percentile
     *   @param {Number} [args.rtrim] Filter values greater than 100-Xth percentile
     *   @param {Boolean} [args.winsorize] If a data point doesn't fit the bounds,
     *       truncate it instead of discarding.
     *   @param {function(Number): Number} [args.transform] Apply function to sample data
     *   @returns {Univariate} copy of the original object
     */
    clone (args = {}) {
        // TODO better name?
        let bins = this.getBins(args);
        if (args.transform)
            bins = bins.map( x => [args.transform(x[0]), x[1]] );

        return new Univariate( {
            precision: args.precision ?? this.getPrecision(),
            base:      args.base      ?? this.getBase(),
            bins,
        } );
    }

    /**
     *  @desc  Returns a sorted list of pairs containing numbers in the sample
     *          and their respective counts.
     *          See addWeighted().
     */
    getBins (args) {
        if (!this._cache.data) {
            this._cache.data = Object.values( this.storage )
                .map( bin => [bin[1], bin[0]] )
                .sort( (x, y) => x[0] - y[0] );
        }
        if (!args)
            return this._cache.data;

        const min = Math.max(
            args.min ?? -Infinity,
            this.percentile( args.ltrim ?? 0 ),
        );
        const max = Math.min(
            args.max ?? +Infinity,
            this.percentile( 100 - (args.rtrim ?? 0) ),
        );
        // TODO allow to skip buckets with too little data - param name???

        if (!args.winsorize)
            return this._cache.data.filter( x => x[0] >= min && x[0] <= max );

        const first = [this.round(min), 0];
        const last  = [this.round(max), 0];
        const out = [first];
        for (const [bin, count] of this._cache.data) {
            if (bin <= first[0])
                first[1] += count;
            else if (bin >= last[0])
                last[1] += count;
            else out.push([bin, count]);
        }
        if (last[1] > 0)
            out.push(last);
        return out;
    }

    /**
     *   @desc     Number of values in the sample.
     *   @returns   {Integer} count
     */
    count () {
        return this._count;
    }

    /**
     *   @desc Minimal value in the sample.
     *          This value is somewhat rounded down to guarantee
     *          it is less than _any_ value in the sample.
     *   @returns {Number} Minimum value
     */
    min () {
        const data = this.getBins();
        return this.lower(data[0][0]);
    }

    /**
     *   @desc Maximal value in the sample.
     *          This value is somewhat rounded up to guarantee
     *          it is greater than _any_ value in the sample.
     *   @returns {Number} Maximum value
     */
    max () {
        const data = this.getBins();
        return this.upper(data[data.length - 1][0]);
    }

    /**
     *   @desc Sum of arbitrary function over the sample.
     *   @param {Function} fun Number->Number
     *   @returns {Number}
     *   @example
     *   stat.sumOf( x => 1 ); // same as stat.count()
     *   @example
     *   stat.sumOf( x => x ); // same as stat.count() * stat.mean()
     */
    sumOf (fun) {
        let s = 0;
        Object.values(this.storage).forEach( bin => { s += bin[0] * fun(bin[1]) } );
        return s;
    }
    // TODO integralOf that takes bucket width into account

    /**
     *  @desc Calculate expected value of a given function over the sample.
     *  @param {function(Number): Number} fun
     *  @returns {Number}
     */
    E (fun) {
        return this._count ? this.sumOf( fun ) / this._count : undefined;
    }

    /**
     * @desc Average value of the sample.
     * @returns {Number}
     */
    mean () {
        return this._count ? this.sumOf( x => x ) / this._count : undefined;
    }

    /**
     * @desc Standard deviation of the sample.
     * Bessel's correction is used:
     * stdev = sqrt( E<(x - E<x>)**2> * n/(n-1) )
     * @returns {Number} Standard deviation
     */
    stdev () {
        // TODO better corrections?
        if (this._count < 2)
            return undefined;
        const mean = this.mean();
        return Math.sqrt( this.sumOf( x => (x - mean) * (x - mean) )
            / (this._count - 1) ); // Bessel's correction
    }

    /**
     *  @desc Skewness is a measure of the asymmetry of a distribution.
     *  Equals to 3rd standardized moment times n^2/(n-1)(n-2) correction
     *  Undefined if there are less than 3 data points.
     *  @returns {Number | undefined}
     */
    skewness () {
        const n = this.count();
        if (n < 3)
            return;
        const correction = n * n / ((n - 1) * (n - 2));
        return correction * this.momentStd(3);
    }

    /**
     *  @desc Kurtosis is a measure of how much of the distribution is
     *        contained in the "tails".
     *        Equals to 4th standardized moment minus 3,
     *        with a correction.
     *  @returns {Number | undefined}
     */
    kurtosis () {
        const n = this.count();
        if (n < 4)
            return;

        // taken from https://en.wikipedia.org/wiki/Kurtosis
        // not sure where it comes from
        // but if Excel is doing that, so do we.
        const corr1 = n * n * (n + 1) / ((n - 1) * (n - 2) * (n - 3));
        const corr2 = (n - 1) * (n - 1) / ((n - 2) * (n - 3));

        return this.momentStd(4) * corr1 - 3 * corr2;
    }

    /**
     *  @desc Moment of nth power, i.e. E((x-offset)**power)
     *  @param {Integer} power Power to raise to.
     *  @param {Number} [offset] Number to subtract. Default is mean.
     *  @returns {Number}
     */
    moment (power, offset) {
        if (!Number.isInteger(power))
            throw new Error('Cannot calculate non-integer moment (did you mean momentAbs?)');
        if (offset === undefined)
            offset = this.mean();
        return this.E( x => (x - offset) ** power );
    }

    /**
     *  @desc Absolute moment of nth power, i.e. E(|x-offset|**power)
     *  @param {Number} power Power to raise to. May be fractional. Default is 1.
     *  @param {Number} [offset] Number to subtract. Default is mean.
     *  @returns {Number}
     */
    momentAbs (power = 1, offset) {
        if (offset === undefined)
            offset = this.mean();
        return this.E( x => Math.abs(x - offset) ** power );
    }

    /**
     *  @desc Standardized moment of nth power, i.e. nth moment / stdev**n.
     *  @param {Integer} power
     *  @returns {Number}
     */
    momentStd (power) {
        return this.moment(power) / this.stdev() ** power;
    }

    /**
     *  @desc A number x such that P(value <= x) === p
     *  @param {Number} p from 0 to 1
     *  @return {Number} value
     *  @example
     *  const stat = new Univariate();
     *  stat.add( 1,2,3,4,5 );
     *  stat.quantile( 0.2 ); // slightly greater than 1
     *  stat.quantile( 0.5 ); // 3
     */
    quantile (p) {
        const target = p * this._count;

        const cumulative = this._cumulative();

        let l = 0;
        let r = cumulative.length;

        // console.log('target=' + target);

        while ( l + 1 < r ) {
            const m = Math.floor( (r + l) / 2 );
            // console.log( '['+l+', '+r+'): middle='+m+':', cumulative[m]);

            if (cumulative[m][1] >= target)
                r = m;
            else
                l = m;
        }

        const start = this.lower(cumulative[l][0]);
        const width = this.upper(cumulative[l][0]) - start;

        // Division by zero must not happen as zero-count buckets
        // should not exist.
        return start + width * (target - cumulative[l][1]) / (cumulative[l][2] - cumulative[l][1]);
    }

    /**
     *  @desc Returns x such that P(value < x) === p%.
     *        Same as quantile(p/100).
     *  @param {Number} p
     *  @returns {Number} x
     */
    percentile (p) {
        return this.quantile( p / 100 );
    }

    /**
     *  @desc Returns x such that half of the sample is less than x.
     *        Same as quantile(0.5).
     *  @returns {Number} x
     */
    median () {
        return this.quantile(0.5);
    }

    /**
     *  @desc Cumulative distribution function, i.e. P(value < x).
     *  This is the inverse of quantile.
     *  @param {Number} x
     *  @returns {Number} probability
     */
    cdf (x) {
        return this._rawCdf(x) / this._count;
    }

    _rawCdf (x) {
        const cumulative = this._cumulative();
        const lookup = this.round(x);

        // binary search
        // Look for the leftmost bucket >= round(x)
        // Count = total to the left of that bucket + maybe partial
        let l = 0;
        let r = cumulative.length;

        while (l < r) {
            const m = Math.floor((r + l) / 2);
            // console.log('['+l+', '+r+'): mid='+m+'; bin=', cumulative[m]);
            if (cumulative[m][0] < lookup)
                l = m + 1;
            else
                r = m;
        }

        // console.log('Looked for '+x+', found: ', [cumulative[l - 1], cumulative[l]] );

        if (l >= cumulative.length)
            return this._count;

        const prior = l > 0 ? cumulative[l - 1][2] : 0;
        const partial = lookup !== cumulative[l][0]
            ? 0
            : (cumulative[l][2] - cumulative[l][1]) // x'th bucket total
                * (x - this.lower(x))               // part left of x
                / (this.upper(x) - this.lower(x))   // bucket width

        return prior + partial;
    }

    /**
     *   @desc Histogram based on the sample
     *   @param {Object} args
     *   @param {Integer} [args.count] Number of bars in the histogram.
     *                                 Default is 10.
     *   @param {Number}  [args.scale] If given, make sure it's
     *   the height of the highest bar.
     *   @return {Array}  Array of triplets: [barHeight, leftBorder, rightBorder ].
     *   rightBorder equals to the next bar's leftBorder.
     */
    histogram (args = {}) {
        // TODO options
        if (!this._count)
            return [];
        const min = this.min();
        const max = this.max();
        const count = args.count || 10;

        const hist = []; // [ count, lower, upper ], ...
        let edge = min;
        const step = (max - min) / count;
        for (let i = 0; i < count; i++)
            hist.push( [this._rawCdf(edge + step), edge, edge += step] );

        // Differenciate (must go backward!)
        for (let i = hist.length; i-- > 1; )
            hist[i][0] -= hist[i - 1][0];

        hist[0][0] -= this._rawCdf(min);

        if (args.scale) {
            // scale to a factor e.g. for drawing pictures
            let max = 0;
            for (let i = 0; i < hist.length; i++) {
                if (max < hist[i][0])
                    max = hist[i][0];
            }

            for (let i = 0; i < hist.length; i++)
                hist[i][0] = hist[i][0] * args.scale / max;
        }

        return hist;
    }

    _cumulative () {
        // integral of sorted bins
        // [ [ bin_center, sum_before, sum_after ], ... ]
        if (!this._cache.cumulative) {
            const data = this.getBins();
            const cumulative = [];
            let sum = 0;
            for (let i = 0; i < data.length; i++)
                cumulative.push( [data[i][0], sum, sum += data[i][1]] );

            this._cache.cumulative = cumulative;
        }
        return this._cache.cumulative;
    }
}

// Memoize! Replace methods with cached counterparts
// '+' at the end if method has arguments
[
    'cdf+',
    'kurtosis',
    'max',
    'mean',
    'min',
    'moment+',
    'momentAbs+',
    'momentStd+',
    'quantile+',
    'skewness',
    'stdev',
].forEach( method => {
    const hasArg = !!method.match(/\+/);
    if (hasArg)
        method = method.replace( '+', '' );
    const orig = Univariate.prototype[method];
    if (typeof orig !== 'function')
        throw new Error('method "' + method + '" is cached but never defined');
    Univariate.prototype[method] = hasArg
        ? function (...arg) {
            if (this._count === 0)
                return undefined;
            if (this._cache[method] === undefined)
                this._cache[method] = {};
            const key = arg.join(':');
            if (this._cache[method][key] === undefined)
                this._cache[method][key] = orig.apply( this, arg );
            return this._cache[method][key];
        }
        : function () {
            if (this._count === 0)
                return undefined;
            if (this._cache[method] === undefined)
                this._cache[method] = orig.apply( this );
            return this._cache[method];
        };
});

class Neat {
    constructor (main) {
        this._main = main;
    }

    min () {
        if (!this._main._count)
            return undefined;
        const data = this._main.getBins();
        return this._main.shorten(data[0][0]);
    }

    max () {
        if (!this._main._count)
            return undefined;
        const data = this._main.getBins();
        return this._main.shorten(data[data.length - 1][0]);
    }
}

[
    'E',
    'kurtosis',
    'mean',
    'median',
    'moment',
    'momentAbs',
    'momentStd',
    'percentile',
    'quantile',
    'skewness',
    'stdev',
    'sumOf',
].forEach( fun => {
    Neat.prototype[fun] = function (arg) {
        return this._main.shorten( this._main[fun](arg) );
    }
});

[
    'cdf',
    'count',
].forEach( fun => {
    Neat.prototype[fun] = function (arg) {
        return this._main[fun](arg);
    }
});

module.exports = { Univariate };