Source code for matviz.histogram_utils



import matplotlib.pyplot as plt
import copy
import numpy as np
import matplotlib.dates as md
import datetime
from numpy import log10
import seaborn as sns
from .etl import handle_dates
from .etl import eps
from .etl import flatten, unflatten
from scipy.ndimage import gaussian_filter

import logging




# exclude extremes will ignore the things outside the bounds,

[docs]
def nhist(X,f=1.2,title=None,xlabel=None,ylabel=None,labels=None,legend=None,noerror=False,
          max_bins=175,std_times=4,color=None,normalize=False,same_bins_flag=False,int_bins_flag=None,
          maxx=None, minx=None, exclude_extremes=False, alpha = .4):
    """
    Plot one or more histograms with automatic binning.

    Automatically sets the number and range of bins using Scott's normal
    reference rule. Compares multiple datasets on one plot with legend,
    mean, and standard deviation markers.

    Parameters
    ----------
    X : list, dict, or DataFrame
        The data to histogram. Can be a single list/array, a list of
        lists, a dictionary of arrays, or a pandas DataFrame.
    f : float, optional
        Factor applied to Scott's normal reference rule. Higher values
        produce more bins. Default is 1.2.
    title : str, optional
        Title for the plot.
    xlabel : str, optional
        Label for the x-axis.
    ylabel : str, optional
        Label for the y-axis. Default is inferred from other settings.
    labels : list of str, optional
        Legend labels for each dataset.
    legend : list of str, optional
        Alias for *labels*.
    noerror : bool, optional
        If True, suppress the mean/std error bars. Default is False.
    max_bins : int, optional
        Maximum number of bins allowed. Default is 175.
    std_times : float, optional
        Number of standard deviations to show on each side.
        Data beyond this is bunched into edge bars. Default is 4.
    color : list or str, optional
        List of RGB colors, a seaborn palette, or a palette name string.
    normalize : bool or str, optional
        Controls the y-axis. ``False`` uses counts for single plots and
        PDF for multiple. Options: ``'frac'``, ``'proportion'``,
        ``'percent'``, ``'number'``, ``'none'``, or ``True`` for PDF.
    same_bins_flag : bool, optional
        Force all datasets to use identical bin edges. Default is False.
    int_bins_flag : bool, optional
        Force bin edges onto integers. Default is None (auto-detect).
    maxx : float, optional
        Maximum x-axis limit.
    minx : float, optional
        Minimum x-axis limit.
    exclude_extremes : bool, optional
        If True, exclude the edge bars that collect data beyond
        *std_times*. Default is False.
    alpha : float, optional
        Transparency of the histogram bars. Default is 0.4.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The current figure. Histogram data is attached as ``fig.nhist``,
        a dict with keys ``'N'`` (counts), ``'bins'`` (bin edges), and
        ``'rawN'`` (raw counts before normalization).

    Examples
    --------
    >>> A = {'mu=0': np.random.randn(10**5), 'mu=2': np.random.randn(10**3) + 2}
    >>> fig = nhist(A)
    >>> fig = nhist(A, color='viridis')
    >>> fig = nhist(A, same_bins_flag=True)
    """

    # do we really want to have redundant labels?
    if legend:
        labels = legend

    # if you passed a pandas DataFrame:
    if str(type(X)) == "<class 'pandas.core.frame.DataFrame'>":
        X = X.to_dict('list')

    # if you passed a dictionary, turn it into a list
    if isinstance(X,dict):
        if legend:
            logging.warning('passed in legend is being ignored. Since you passed a dict there ' + \
                  'would be no way for us to enforce that the order be the same')
        labels = list(X.keys())
        X = list(X.values())

    # in case we only have one item total to graph
    if not hasattr(X[0], "__len__"):
        X = [X]

    X = [list(x) for x in X]


    X, datetime_flag, date_formatting = handle_dates(X)

    # remove nones or nans
    X = [[x1 for x1 in x if x1 is not None and not np.isnan(x1)] for x in X]

    # count and remove -infs and infs
    inf_counts = [ [sum(np.array(x) == -np.inf), sum(np.array(x) == np.inf)] for x in X]
    X = [[x1 for x1 in x if np.isfinite(x1)] for x in X]



    S = len(X)

    bin_factor = f


    multiple_flag = len(X)>1 # normal is separate hists for each guy you plot instead of them all on the same bit.
    fracylabel_flag = False
    if normalize:
        # correct for shorthand input
        if normalize == 'pcnt':
            normalize = 'percent'

        if normalize in ['frac','proportion','prop','percent']:
            normalize_flag = True
            # if normalize in ['frac']:
            fracylabel_flag = True
            # else:
            #     fracylabel_flag = False
        elif normalize in {'number', 'none'}:
            normalize_flag = False
        else:
            normalize_flag = normalize


    elif normalize == False:
        normalize_flag = multiple_flag
    else: # must be set to none specifically
        normalize_flag = False

    num_points = [len(x) for x in X]
    X_std = [np.std(x) for x in X]
    X_mean = [np.mean(x) for x in X]


    # n_bins = np.min([n_bins, 1000])
    # n_bins = np.max([n_bins,10])
    bins_in, bin_widths = choose_bins(X,bin_factor=bin_factor,max_bins=max_bins,std_times=std_times,
                                      sameBinsFlag=same_bins_flag,maxx=maxx,minx=minx,int_bins_flag=int_bins_flag,
                                      exclude_extremes=exclude_extremes)

    # todo: change the error here to a warning, and set labels to the default
    if labels:
        if len(labels) != S:
            logging.error("Your labels list length " + str(len(labels)) + " doesn't equal the number of elements in your data: " + str(S))
    else:
        labels = [str(w) for w in range(S)]


    # calculate the histogram:
    N = [np.histogram(x, bins=bin)[0] for x,bin in zip(X,bins_in)]

    if not exclude_extremes:
        # add in those infinity's that we removed earlier
        for ii, cur_counts in enumerate(inf_counts):
            N[ii][0] += cur_counts[0]
            N[ii][-1] += cur_counts[1]



    def adjust_bins_to_plot(bins, bin_width):
        bins[0] = bins[1] - bin_width
        bins[-1] = bins[-2] + bin_width
        return bins

    if exclude_extremes:
        bins_to_plot = bins_in
    else:
        bins_to_plot = [adjust_bins_to_plot(cur_bins, cur_width) for cur_bins, cur_width in zip(bins_in, bin_widths)]

    rawN=copy.deepcopy(N) # save the rawN before normalization

    if normalize_flag:
        #     n   = (each value)/(width of a bin * total number of points)
        #     n   =  n /(Total area under histogram);
        if fracylabel_flag:
            for k in range(S):
                N[k]=100.0*N[k]/(num_points[k])
        else:
            for k in range(S):
                N[k]=N[k]/(bin_widths[k]*num_points[k])

    # current_palette = sns.color_palette("Set2",5)
    if color is not None:
        if type(color) == str:
            current_palette = sns.color_palette(color, S)
        else:
            current_palette= color
    else:
        current_palette = sns.color_palette("Set1",S)
        current_palette = current_palette[::-1]
        # switch red and blue if you want ...
        # if len(current_palette)>1:
        #     tmp = current_palette[0]
        #     current_palette[0]=current_palette[1]
        #     current_palette[1]=tmp
        # make it so that blue is the color if only one color is used.
        if len(current_palette)==1:
            current_palette = sns.color_palette("Set1",2)
            current_palette = [current_palette[1]]

    # sns.palplot(current_palette)

    N_max = max([max(n) for n in N])

    for k in range(S):
        # add a zero to the end here for that last non-bin
        plt.bar(bins_to_plot[k], np.append(N[k],0), width=bin_widths[k], \
                label=labels[k], alpha=alpha, \
                lw=0, color=current_palette[k], \
                align='edge')
        # plt.bar(np.append(0,bins_in[k]),np.append(np.append(N[k],0),0), width=bin_widths[k], \
        #         label=labels[k],alpha=.5, \
        #         lw=0,color=current_palette[k], \
        #         align='edge')

        # add a zero to the beginning here so that the line completes down to the bottom
        # and on the other end too.
        plt.step(np.append(bins_to_plot[k],bins_in[k][-1]), np.append(np.append(0, N[k]), 0), \
                 lw=2, color=current_palette[k], \
                 )


        # plt.bar(bins_in[k][:-1], N[k], width=bin_widths[k], alpha=alpha, \
        #         fc=current_palette[k],label=labels[k], \
        #         lw=0,\
        #         align='center')
        #
    # plot the errorbars
    if not noerror:
        for k in range(S):
            plt.errorbar(X_mean[k],N_max*(1+.1*(S-k+1)),xerr=X_std[k],fmt='o',color=current_palette[k],alpha=1,lw=3)


    if not ylabel:
        if normalize_flag:
            if fracylabel_flag:
                ylabel = '% of points'
            else:
                ylabel = 'pdf'
        else:
            ylabel = 'number'
    plt.ylabel(ylabel)

    if multiple_flag:
        plt.legend(loc='upper right')
        plt.legend(framealpha=0.0)

    # plt.autoscale(enable=True, axis='y', tight=True)

    # plot those ***s
    # get those legend colors - and plot them only if
    # for each thingy the N[0] is not zero, on the left, or if the N[-1] is not zero on the right


    if title:
        plt.title(title)
    if xlabel:
        plt.xlabel(xlabel)

    ax = plt.gca()
    if datetime_flag:
        xfmt = md.DateFormatter(date_formatting)
        ax.xaxis.set_major_formatter(xfmt)
        [tick.set_rotation(70) for tick in ax.get_xticklabels()]
        plt.subplots_adjust(bottom=0.3)



    # clean up things a bit
    plt.grid(False)

    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    fig = plt.gcf()
    fig.nhist = {'N': N, 'bins': bins_in, 'rawN': rawN}
    return fig





[docs]
def ndhist(x, y=None, log_colorbar_flag=False, maxx=None, maxy=None, minx=None, miny=None,
                                    int_bins_flag=False, int_bins_flagx=False, int_bins_flagy=False,
                                    exclude_extremes=False,
                                    normy=False, normx = False,
                                    fx=1.5, fy=1.5, std_times=4, f=None,
                                    smooth = False,
                                    markertype=None,
                                    normr = False,
                                    colors = 'none',
                                    levels=False,
                                    level_color=None):
    """
    Plot a 2D histogram (heat map) with automatic binning.

    Accepts two arrays, a single array (treated as a time series), or a
    complex-valued array (real/imag mapped to x/y).

    Parameters
    ----------
    x : array-like or complex array
        X values, or y values if *y* is not given, or complex numbers
        where real part is x and imaginary part is y.
    y : array-like, optional
        Y values. Leave blank for time-series mode or complex input.
    log_colorbar_flag : bool, optional
        Use a log scale for the color bar. Default is False.
    maxx, maxy, minx, miny : float, optional
        Axis limits. Default is None (auto).
    int_bins_flag : bool, optional
        Force both x and y bin edges onto integers. Default is False.
    int_bins_flagx, int_bins_flagy : bool, optional
        Force only x or y bins onto integers. Default is False.
    exclude_extremes : bool, optional
        Exclude edge bins that collect data beyond *std_times*.
        Default is False.
    normy : bool, optional
        Normalize colors per y-slice. Default is False.
    normx : bool, optional
        Normalize colors per x-slice. Default is False.
    fx : float, optional
        Bin factor for x-axis. Default is 1.5.
    fy : float, optional
        Bin factor for y-axis. Default is 1.5.
    std_times : float, optional
        Number of standard deviations to display. Default is 4.
    f : float, optional
        Bin factor applied to both axes. Overrides *fx* and *fy*.
    smooth : float, optional
        Gaussian filter sigma in pixels. 0 or False for no smoothing.
        Default is False.
    markertype : str, optional
        Marker style to overlay data points (e.g. ``'.'``).
    levels : bool or list, optional
        If True, draw filled contours. If a list of values, draw contour
        lines at those percentile levels. Default is False.
    level_color : dict, optional
        Dict with ``'level'`` and ``'cmap'`` keys for filled contour
        coloring.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The current figure. Data is attached as ``fig.ndhist``, a dict
        with keys ``'counts'``, ``'bins_x'``, and ``'bins_y'``.

    Examples
    --------
    >>> x = np.random.randn(10000)
    >>> y = x + np.random.randn(10000)
    >>> fig = ndhist(x, y)

    >>> z = (5 + np.random.randn(1000)) * np.exp(1j * np.random.randn(1000))
    >>> fig = ndhist(z, smooth=1)
    """

    # convertes the counts into percentages
    def counts_to_pcnts(counts):
        flat_counts = flatten(counts)
        flat_norms = np.zeros(flat_counts.shape)
        cum_summ = 0
        for w in sorted(np.unique(flat_counts), reverse=True):
            I = (w == flat_counts)
            flat_norms[I] = cum_summ
            cum_summ += w * sum(I)
        flat_norms = 100 * flat_norms / cum_summ
        norms = unflatten(flat_norms, counts)

        return norms

    class nf(float):
        def __repr__(self):
            s = f'{self:.1f}'
            return f'{self:.0f}' if s[-1] == '0' else s


    if colors == 'none':
        if levels:
            if type(levels) == bool:
                colors = plt.cm.Greens
            else:
                colors = 'gray'

        else:
            colors = plt.cm.Greens_r

    if np.array(x).dtype == 'complex128':
        y = x.imag
        x = x.real

    # if you just passed a timeseries, then use the x as an index
    if not hasattr(y, '__len__'):
        y = copy.deepcopy(x)
        x = range(len(y))

    # check for incompatable user params
    if normy & normx:
        raise Warning("you can't normalize on both x and y at the same time, so we just normalized by X")
        normy = False

    if f:
        fx = f
        fy = f

    x = np.array(x)
    y = np.array(y)

    mask = np.isfinite(y + x)
    x = x[mask]
    y = y[mask]

    # if your limits are infinitiy, then choose the max/min of the data
    if minx == -np.inf:
        minx = np.min(x)
    if miny == -np.inf:
        miny = np.min(y)
    if maxx == np.inf:
        maxx = np.max(x)
    if maxy == np.inf:
        maxy = np.max(y)

    # if ylims == 'auto':
    #     cur_std = np.std(y)
    #     cur_mean = np.mean(y)
    #     mask = (y > cur_mean - std_times * cur_std) & (y < cur_mean + std_times * cur_std)
    #     x = x[mask]
    #     y = y[mask]
    #

    #  choose the first result, then the first answer in it


    if int_bins_flag:
        int_bins_flagx = True
        int_bins_flagy = True

    # Better to implement choosebins base to work with one rather than an array
    binsx = choose_bins([x], minx=minx, maxx=maxx, int_bins_flag=int_bins_flagx, bin_factor=fx, exclude_extremes=exclude_extremes, std_times=std_times)[0][0]
    binsy = choose_bins([y], minx=miny, maxx=maxy, int_bins_flag=int_bins_flagy, bin_factor=fy, exclude_extremes=exclude_extremes, std_times=std_times)[0][0]

    counts, bins_x, bins_y = np.histogram2d(x, y, (binsx, binsy))

    if smooth:
        counts = gaussian_filter(counts, smooth)

    to_plot = counts.transpose()
    if log_colorbar_flag:
        # todo: change this to be actually adjusting the colorbar, not the counts
        to_plot = log10(1 + to_plot)
        # maybe this isn't needed? Counts cant be less than 0, so the log10(1) must be zero
        to_plot[to_plot == -np.inf] = -np.max(counts) / len(x)
        if levels:
            #ToDo: Make % bins work with log colorbar, switching back to regular plot
            logging.warning("Your chosen levels are being ignored because of the log_colorbar_flag")
            levels = 'none'

    #     else:
    #         to_plot[to_plot==-np.inf] = -np.max(counts) / len(x)
    # normalize by x or y
    if normx:
        to_plot = to_plot / (eps + np.max(to_plot, axis=0))
    elif normy:
        to_plot = (to_plot.T/(np.max(to_plot, axis=1) + eps)).T

    if normr:
        # R2 = X. ^ 2 + Y. ^ 2; % R. ^ 2
        X, Y = np.meshgrid(binsx, binsy)
        R2 = np.power(X, 2) + np.power(Y, 2)
        R = np.sqrt(R2)# R^2
        to_plot = to_plot * R


    # eventually have this be the default if it can be sped up
    if levels:
        to_plot = counts_to_pcnts_fast(to_plot)
        dsx = np.diff(bins_x)[1]
        dsy = np.diff(bins_y)[1]

        bx = np.append(bins_x[1:-1] - dsx / 2, np.array([bins_x[-2] + dsx / 2]))
        by = np.append(bins_y[1:-1] - dsy / 2, np.array([bins_y[-2] + dsy / 2]))

        if isinstance(levels, bool):

            plt.contourf(bx, by, to_plot, cmap=colors, levels=100)
        else:
            # fig, ax = plt.subplots()
            ax = plt.gca()
            CS = plt.contour(bx, by, to_plot, colors=colors, levels=levels)
            CS.levels = [nf(val) for val in CS.levels]
            ax.clabel(CS, CS.levels, inline=True, fmt='%r %%', fontsize=10, colors = 'k')
            if level_color is not None:
                plt.contourf(bx, by, to_plot, levels=[0, level_color['level']], cmap=level_color['cmap'])

        # fig, ax = plt.subplots()

    else:
        plt.pcolor(bins_x, bins_y, to_plot, cmap=colors)

    plt.xlim(np.array(bins_x)[[1, -2]])
    plt.ylim(np.array(bins_y)[[1, -2]])

    if markertype != None:
        plt.plot(x, y, markertype)

    # plt.axis('tight')
    fig = plt.gcf()
    fig.ndhist = {'counts': counts, 'bins_x': bins_x, 'bins_y': bins_y}
    return fig



# this version of the algorithm is very fast - but will artificially include differences in colors
# could rewrite it somehow to use this and be faster:
# etl.start_and_ends(diff(counts_ordered) < 0)

[docs]
def counts_to_pcnts_fast(counts):
    flat_counts = flatten(counts)
    # sort the counts,            [in reverese]
    idxs = np.argsort(flat_counts)[::-1]
    idxs_undo = np.argsort(idxs)
    # flip from greatest to smallest
    counts_ordered = flat_counts[idxs]
    # this is counting the contents of the bins in order from fullest to least full
    counts_summed = np.cumsum(counts_ordered)
    # convert the counts to a percent of the total N items histogrammed
    flat_norms = 100 * counts_summed / sum(counts_ordered)

    # get unique values of number of elements in each bin
    unique_values = np.unique(counts_ordered)

    # for each unique bin value, avg the pcnts and assign that avg for every index with that value
    for value in unique_values:
        indices_to_avg = np.where(counts_ordered == value)
        avg_pcnt = np.mean(flat_norms[indices_to_avg])
        flat_norms[indices_to_avg] = avg_pcnt

    # reverse the operations to get back to our original orders and shape
    norms_unsorted = flat_norms[idxs_undo]
    norms = unflatten(norms_unsorted, counts)
    return norms



[docs]
def choose_bins(X, min_bins=10, max_bins=175, bin_factor=1.5, sameBinsFlag=False, std_times=4, minx=None, maxx=None,
                int_bins_flag=None, exclude_extremes=True):

    # Uses Scott's normal reference rule to select the number of bins
    # https://en.wikipedia.org/wiki/Histogram#Scott's_normal_reference_rule
    # If multiple sets of data are passed then it makes sensible choices

    S = len(X)

    minS = np.zeros(S)
    maxS = np.zeros(S)

    stdV = [np.std(w) for w in X]
    meanV = [np.mean(w) for w in X]
    for k in range(S):
        Values = X[k]
        # set x MIN values
        very_small_num = 0.000000000000001
        if minx is None: # user did not specify - then we need to find the minimum x value to use
            if stdV[k]>0.000000000000001: # just checking there are more than two different points to the data, checking for rounding errors.
                leftEdge = meanV[k]-stdV[k]*std_times
                if leftEdge<np.min(Values): # if the std is larger than the largest value
                    minS[k]=np.min(Values)
                else: # cropp it now on the bottom.
    #             cropped!
                    minS[k] = leftEdge
            else: # stdV==0, wow, all your data points are equal
                minS[k]=np.min(Values)-0.000000000000001 # padd it by 100, seems reasonable
        else: # minX is specified so minS is just set stupidly here
            if minx < np.max(Values):
                minS[k] = minx
            else: # ooh man, even your biggest value is smaller than your min
                logging.warning(['user parameter minx=' + str(minx) + ' is being overriden since it put all your data out of bounds'])
                minS[k] = np.min(Values)

    # set x MAX values
        if maxx is None:
            if stdV[k]>very_small_num: # just checking there are more than two different points to the data
                rightEdge = meanV[k]+stdV[k]*std_times
                if rightEdge>np.max(Values): # if the suggested border is larger than the largest value
                    maxS[k]=np.max(Values)
                else: # crop the graph to cutoff values
                    maxS[k]=rightEdge
            else: # stdV==0, wow,
    #           Note that minX no longer works in this case.
                maxS[k]=np.max(Values)+very_small_num # padd it by 100, seems reasonable
        else: # maxX is specified so minS is just set here
            maxS[k]=maxx;
            if maxx>np.min(Values):
                maxS[k]=maxx
            else: # ooh man, even your smallest value is bigger than your max
                maxS[k]=np.max(Values)
                # warning(['user parameter maxx=' num2str(maxX) ' override since it put all your data out of bounds']);
    #

    # if intbins(k)
    #     maxS(k)=round(maxS(k))+.5;
    #     minS(k)=round(minS(k))-.5; # subtract 1/2 to make the bin peaks appear on the numbers.
    # end

    # need the isData boolean to so you are only looking at the ones that actually have some data
    x_min = np.min(minS)
    x_max = np.max(maxS)

# % note that later there will be a bit added to maxS of SXRange
# % This below is to get estimates for appropriate binsizes
# totalRange=diff(SXRange); % if the range is zero, then make it eps instead.


    total_range = x_max - x_min

    if int_bins_flag is not None:
        int_bins = [int_bins_flag for x in X]
    else:
        int_bins = [isdiscrete(x) for x in X]

    # scotts choice bin width
    bin_widths = [3.5 * np.std(x) / (bin_factor * np.power(len(x), 1.0 / 3)) for x in X]

  # Instate a mininum and maximum number of bins
    num_bins = [1.0 * total_range/bin_width for bin_width in bin_widths] # Approx number of bins
    num_bins = np.round(num_bins)
    for k in range(S):
        if num_bins[k]<min_bins: # if this will imply less than 10 bins
            num_bins[k] = min_bins
            bin_widths[k] = total_range / (min_bins) # set so there are ten bins

        if num_bins[k]>max_bins: # if there would be more than 175 bins (way too many)
            num_bins[k] = max_bins
            bin_widths[k]=total_range / max_bins

        # This will make the span of bins have edges that go up to and include the extreme-most data
        bin_widths[k] = total_range / num_bins[k] + eps * 100

#   Check if it is intbins, because then:
        if int_bins[k]: # binwidth must be an integer, and it must be at least 1
            bin_widths[k] = np.max([np.round(bin_widths[k]), 1])
            x_min = np.floor(x_min)


##   if there is enough space to plot them, then plot vertical lines.
## 30 bins is arbitrarily chosen to be the number after which there are
## vertical lines plotted by default
    # if n_bins(k)<30:
    #     vertLinesArray(k)=1;
    # else
    #     vertLinesArray(k)=0;
## fix the automatic decision if vertical lines were specified by the user
# if vertLinesForcedFlag
#     vertLinesArray=vertLinesArray*0+vertLinesFlag;

# only plot lines if they all can be plotted.
# also creates one flag, so the 'array' does not need to be used.
# vertLinesFlag=prod(vertLinesArray); # since they are zeros and 1's, this is an "and" statement

# find the maximum bin width
    big_bin_width=np.max(bin_widths)

##  resize bins to be multiples of each other - or equal
# sameBinsFlag will make all bin sizes the same.

    # Note that in all these conditions the largest histogram bin width
    # divides evenly into the smaller bins. This way the data will line up and
    # you can easily visually compare the values in different bins
    if sameBinsFlag: # if 'same' is passed then make them all equal to the average recommended size
        only_bin_width = np.mean(bin_widths)
        if int_bins_flag:
            only_bin_width = np.round(only_bin_width)
        bin_widths=[only_bin_width] * len(bin_widths)
    else: # the bins will be different if neccesary
        for k in range(len(X)):
    #       The 'ceil' rather than 'round' is supposed to make sure that the
    #       ratio is at lease 1 (divisor at least 2).
            bin_widths[k]=big_bin_width/np.ceil(big_bin_width/bin_widths[k])

    # SXRange(2) = SXRange(2)+max(binWidth)
    if exclude_extremes:
        bins = [np.arange(x_min, x_max + big_bin_width, w) for w in bin_widths]
    else:
        bins = [np.concatenate([[-np.inf], np.arange(x_min, x_max + big_bin_width, w), [np.inf]]) for w in bin_widths]

    return bins, bin_widths




[docs]
def dictify_cols2(df):
    return df.groupby(df.columns[0])[df.columns[1]].apply(lambda w: w.values).to_dict()



# This will tell if the data is an integer or not.
# first it will check if the variable is integer, but even so, they
# might be integers stored as floats!

[docs]
def isdiscrete(x,min_error='default'): # L stands for logical
    if min_error=='default':
        min_error=np.spacing(1)*100 # the minimum average difference from integers they can be.

    if isinstance(x,int):# your done, duh, its an int.
        return True
    else:
        return np.sum(np.abs(x-np.round(x)))/len(x)<min_error




[docs]
def test_nhist_list():
    # import numpy as np
    # from viz.histogram_utils import nhist
    y = np.random.randn(100) + 2
    nhist(y)

    A = [np.random.randn(10 ** 5), np.random.randn(10 ** 3) + 1]
    nhist(A, legend = [r'$\mu$=0', r'$\mu$=1'])



[docs]
def test_nhist_dict():
    A = {'mu_is_Zero': np.random.randn(10 ** 5), 'mu_is_Two': np.random.randn(10 ** 3) + 2}
    _ = nhist(A)
    _ = nhist(A, color='viridis')
    _ = nhist(A, f=4)
    _ = nhist(A, same_bins_flag=True)
    _ = nhist(A, noerror=True)


# Use ndhist without specifying an x axis.

[docs]
def test_ndhist_timeseries():
    n = 10000
    y = np.cumsum(np.random.randn(n)) + 15 * np.random.randn(n)
    _ = ndhist(y, fx=5)
    plt.xlabel('sample number')


# This one looks kind of like a black hole

[docs]
def test_ndhist_complex():
    n = 10000
    z = (5 + np.random.randn(n)) * np.exp(1j * (np.random.randn(n) + np.pi / 4))
    _ = ndhist(z, smooth=1)
    plt.colorbar()


# use the log colorbar to manage varying

[docs]
def test_ndhist():
    x = np.concatenate([np.ones(500), np.random.randn(100000), 4 + np.random.randn(1000) / 2])
    y = np.concatenate([np.ones(500), np.random.randn(100000), 3 + np.random.randn(1000) / 2])
    counts, bins_x, bins_y = ndhist(x, y, log_colorbar_flag=True)




if __name__=='main':
    pass