Source code for bedPyLoad.mor_fun

"""
Utility functions that can be used at all levels of the package, for instance, to plot data.
The module name mor_fun is short for 'morpho-analyst functions' or 'more fun', depending on your preference.
"""
try:
    import os.path
    import pandas
    from .config import *
except Exception as e:
    print('Import incomplete - errors likely:\n' + str(e))



[docs]
def annotated_plot(
        df,
        target_var,
        num_var,
        x_label=None,
        y_label=None,
        plot_type='boxplot',
        fig_format='png',
        fig_path=None,
        color_palette=None,
        dpi=300,
        bbox='tight',
        test='Kruskal',
        text_format='simple',
        text_offset=7,
        y_scale=None
):
    """
    Make an annotated plot with statannotations. Read more about statannotations usage at
    https://github.com/trevismd/statannotations/tree/master/usage

    The figure is written to ``str(fig_path) + <original target_var> + '.' + fig_format``.

    :param pd.DataFrame df: DataFrame containing categorical and numerical data to be plotted.
                            Categories will occur on the x-axis according to target_var.
                            Numerical data according to num_var
    :param str target_var: name of a target variable that must be contained in the column names of df;
                            it is also used as key for SUB_CATEGORY_ORDER and as figure file name
    :param str num_var: name of a numerical variable on the Y-axis that must be contained in the column names of df
    :param str x_label: if provided, this string replaces the column name of target_var (categorical)
    :param str y_label: if provided, this string replaces the column name of the numeric y variable
    :param str plot_type: default is 'boxplot', options are 'violinplot', 'swarmplot'
    :param str fig_format: file ending of image file; default is 'png'
    :param str fig_path: directory of the image (figure) to save, MUST end on '/'. Should always be provided
                            because the default None is converted with str() and becomes part of the file name.
    :param str,list,dict color_palette: passed as `palette` to the seaborn plot function
    :param int dpi: dots per inch for figure (default is 300)
    :param str bbox: default 'tight' applies narrow figure margins
    :param str test: type of statistical test for calculating p-values. Default is 'Kruskal'. Options are defined in
                        statannotations.stats.StatTest.STATTEST_LIBRARY (line 88ff)
    :param str text_format: formatting of p-value annotations. Default is 'simple'. Options are 'star' and 'full'
    :param int text_offset: number of pixels for offset of p-value annotations. Default is 7.
    :param str y_scale: default is None; ANY truthy value (e.g. 'log') switches the y axis to logarithmic scale.
    :return int: 0 = success, -1 = error occurred
    """

    configuration = {
        'test': test,
        'text_format': text_format,
        'text_offset': text_offset,
    }

    # treat original dataframe (remove nan and potentially replace labels)
    df = df[[target_var, num_var]].copy()
    df = df.dropna().reset_index(drop=True)
    logging.info('* created the following dataframe for test: ' + str(test))
    logging.info(df.head(3))
    # remember the original column name: config lookups and the file name rely on it
    cat_key = target_var
    if x_label:
        df = df.rename(columns={target_var: x_label})  # must not use inplace=True
        target_var = x_label
    if y_label:
        df = df.rename(columns={num_var: y_label})
        num_var = y_label

    # setup figure
    col_categories = np.unique(df[target_var]).tolist()  # must be a list
    configured_order = SUB_CATEGORY_ORDER[cat_key]
    if configured_order:
        # Build a NEW list: removing items from the configured list while looping over it
        # skips elements and would permanently alter the shared configuration.
        categories = [cat for cat in configured_order if cat in col_categories]
    else:
        categories = col_categories
    fig_args = {
        'x': target_var,
        'y': num_var,
        # 'hue': target_var,
        'data': df,
        'order': categories,
        'palette': color_palette,
        'boxprops': {'facecolor': (.3, .3, .3, .07)},
        'flierprops': {'marker': 'o', 'markerfacecolor': 'gray', 'markersize': 5,
                       'markeredgecolor': 'black', 'alpha': .35},
        'medianprops': {'color': 'gray'},
        # 'hue_order': categories,
        'dodge': True,
    }

    # prepare plot
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    # setup fonts
    sns.set_context("paper", font_scale=1.5)
    # apply scaling if defined
    if y_scale:
        logging.info('* using logarithmic y-axis')
        ax.set_yscale('log')
    # make plot
    logging.info('* making base plot...')
    plot_type_dict = {
        'boxplot': sns.boxplot,
        'violinplot': sns.violinplot,
        'swarmplot': sns.swarmplot,
    }
    plot_type_dict[plot_type](ax=ax, **fig_args)

    # define pairs for calculating p-values
    if not configured_order:
        # if categories are not defined neighbors as per config.py (i.e. False)
        logging.info('* pairing all categories...')
        pairs = list(combinations(categories, 2))  # last 2 sets pair-wise (not 3-3 cats or similar)
    else:
        # this applies only pairing neighboring categories (e.g. good for seasons such as winter and spring)
        logging.info('* pairing neighboring categories only...')
        rotated_cats = categories[1:] + categories[:1]
        # idx - 1 wraps around for idx == 0, which closes the circle (last category with first)
        pairs = [(cat, rotated_cats[idx - 1]) for idx, cat in enumerate(rotated_cats)]
    logging.info('* using the following pairs: ' + str(pairs))

    # instantiate Annotator
    logging.info('* annotating figure...')
    try:
        annotator = Annotator(
            ax=ax, pairs=pairs, **fig_args
        )
    except Exception as e:
        logging.error('ERROR: could not plot {0}:\n {1}'.format(str(target_var), str(e)))
        plt.close(fig)  # do not leave the figure open when bailing out
        return -1
    annotator.new_plot(ax, plot=plot_type, **fig_args)
    logging.info('* applying {0} test to calculate p-values...'.format(str(test)))
    try:
        if len(pairs) < 13:
            annotator.configure(**configuration).apply_test().annotate()
        else:
            logging.warning('* SKIPPING ANNOTATION (%s are too many pairs for visualization)' % str(len(pairs)))
    except Exception as e:
        logging.error(e)
        logging.error('*** looks like there are NoneTypes in the data set...')
        plt.close(fig)
        return -1
    plt.xticks(rotation=45)
    ax.set(xlabel=None)  # remove redundant x-label information
    # write figure to disk
    try:
        fig_name = str(fig_path) + str(cat_key) + '.' + str(fig_format)
        logging.info('  -- trying to save ' + fig_name)
        fig.savefig(
            fig_name,
            format=fig_format,
            dpi=dpi,
            bbox_inches=bbox
        )
        logging.info('* saved ' + fig_name)
    except Exception as e:
        logging.error('ERROR WHILE SAVING FIGURE:')
        logging.error(e)
        return -1
    finally:
        # release the figure whether or not saving worked
        plt.close(fig)
    return 0




[docs]
def stats_test(
        dataframe: pandas.DataFrame,
        numeric_var_name: str,
        target_columns: list,
        numeric_var_as_categories_name: str = None,
        stats_results_xlsx: str = 'stats-results.xlsx',
        figure_path: str = 'fitting-results/figures/'
):
    """
    Runs Dunn posthoc tests (Holm-adjusted) on categories with reference to a non-normally distributed variable
    defined as `numeric_var_name` and creates an annotated plot for every target column. This function is tweaked
    for this package and requires the global variables defined in config.py.

    Problems in the Dunn tests or in plotting are logged as warnings and do not interrupt the loop over
    `target_columns`.

    :param dataframe: A pandas dataframe containing all numeric and categorical data
    :param numeric_var_name: Name of a numerical variable (typical response variable) to be tested. MUST be a column
                             name of `dataframe`
    :param target_columns: List of column names to be tested for differences with the numeric variable
    :param numeric_var_as_categories_name: For the Dunn test, the numerical variable should also be categorized
                                           (e.g. in categories 'low', 'average', 'high'). This argument is the name of a
                                           column in `dataframe` that contains the numerical variable as categories.
                                           If not provided (None), the Dunn tests are skipped and only plots are made.
    :param stats_results_xlsx: Name of an xlsx file to store Dunn test results (default name applies if not provided)
    :param figure_path: directory or subdirectory where figures will be stored; MUST end on '/'
    :return int success: 0 after all target columns were processed (failures are reported through the log only)
    """

    for target_var in target_columns:
        if numeric_var_as_categories_name == target_var:
            # do not correlate the numeric version of a variable with its categorization
            continue
        logging.info('> Pre-processing for Dunn tests...' + target_var)
        if numeric_var_as_categories_name:
            # Only build the test dataframe when a categorized column is named: indexing the
            # dataframe with the default None would raise a KeyError.
            df_cat4sub = pd.concat(
                [dataframe[numeric_var_as_categories_name], dataframe[target_var]],
                axis=1
            ).reset_index().dropna(axis=0)
            # number of rows (samples), not number of cells
            logging.info('  ...with %s samples.' % str(len(df_cat4sub)))

            # run Dunn tests in both directions and write results to the XLSX file
            logging.info('> Dunn test with categorical data driving categorized numerical variable...')
            try:
                num_as_cat_df = sp.posthoc_dunn(df_cat4sub, val_col=target_var, group_col=numeric_var_as_categories_name, p_adjust='holm')
                append_df_to_excel(stats_results_xlsx, sheet_name=target_var, df=num_as_cat_df, startrow=0)
            except Exception as e:
                logging.warning('WARNING: NOT ENOUGH SAMPLES - SKIPPING DUNN TEST DIRECTION\n' + str(e))
            logging.info('> Dunn test for categorical data as a function of categorized numerical variable...')
            try:
                cat_from_num_df = sp.posthoc_dunn(df_cat4sub, val_col=numeric_var_as_categories_name, group_col=target_var, p_adjust='holm')
                # second result table goes below the first one on the same sheet
                append_df_to_excel(stats_results_xlsx, sheet_name=target_var, df=cat_from_num_df, startrow=9)
            except Exception as e:
                logging.warning('WARNING: NOT ENOUGH SAMPLES - SKIPPING DUNN TEST DIRECTION\n' + str(e))
        else:
            logging.warning('> Skipping Dunn test because I do not have information on categorization of the numerical parameter')

        logging.info('> Jumping into annotated plotting...')
        os.makedirs(figure_path, exist_ok=True)
        try:
            plot_status = annotated_plot(
                dataframe,
                target_var,
                num_var=numeric_var_name,
                x_label=FULL_LABEL_DICT[target_var],
                y_label=FULL_LABEL_DICT[numeric_var_name],
                fig_path=figure_path,
                dpi=500,
            )
        except Exception as e:
            logging.warning('WARNING: NOT ENOUGH SAMPLES - SKIPPING ANNOTATED PLOT\n' + str(e))
        else:
            # annotated_plot signals handled errors through its return value instead of raising
            if plot_status != 0:
                logging.warning('WARNING: annotated plot for %s was not created (see errors logged above)' % str(target_var))
        logging.info('> STATS TEST FINISHED FOR ' + numeric_var_name.upper())
    return 0



def log_actions(func):
    """ Decorator that configures file logging and closes log handlers after the decorated call

    When the decorator is applied, ``logging.basicConfig`` is called to log at INFO level into 'logfile.log'.
    After every call of the decorated function, all handlers attached to the loggers named 'logfile',
    'warnings' and 'errors' are closed and removed, and a hint on the logfiles is printed. The clean-up also
    runs when the decorated function raises.

    :param callable func: function to be wrapped
    :return callable: wrapper that forwards all arguments to func and returns its result
    """
    import functools  # local import: keeps the module-level import block untouched

    logging.basicConfig(
        filename='logfile.log',
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s]:%(message)s'
    )

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        finally:
            for logger_name in ('logfile', 'warnings', 'errors'):
                logger = logging.getLogger(logger_name)
                # iterate over a copy: removeHandler mutates logger.handlers and would skip entries
                for handler in list(logger.handlers):
                    handler.close()
                    logger.removeHandler(handler)
            print('Check the logfiles: logfile.log, warnings.log, and errors.log.')
    return wrapper



[docs]
def plot_df_completeness(
        df,
        figure_base_name='base',
        replace_col_names=None
):
    """ Plots the completeness of a dataframe with the missingno package and saves it as PNG

    The image is written to ``figure_base_name + '-completeness.png'`` (300 dpi, tight bounding box).

    :param pandas.DataFrame df: Dataframe to be plotted
    :param str figure_base_name: prefix (may include a directory) of the figure file name
    :param dict replace_col_names: optional mapping handed to df.rename(columns=...) to overwrite column names
    :return: -1 if the columns could not be renamed, otherwise None (the plot is written to disk)
    """

    if replace_col_names:
        try:
            df = df.rename(columns=replace_col_names)
        except Exception as rename_error:
            logging.error(
                "ERROR: The provided argument replace_col_names does not fit the dataframe:\n" + str(rename_error)
            )
            return -1

    # msno.matrix hands back a plot object; the figure that owns it is what gets saved
    completeness_figure = msno.matrix(df, labels=True).get_figure()
    completeness_figure.savefig(
        figure_base_name + '-completeness.png',
        bbox_inches='tight',
        dpi=300
    )
    plt.close()

    logging.info("successfully saved dataframe completeness plot as " + figure_base_name)




[docs]
def get_color_list(n, name='hsv'):
    """ Returns a list of n colors (RGBA tuples) sampled from a matplotlib colormap

    :param int n: size of colormap list
    :param str name: type of color map - must be a standard matplotlib colormap name
    :return list: n colors taken from the colormap resampled to n entries
    """
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in matplotlib 3.7 and removed in 3.9
    cmap = plt.get_cmap(name, n)
    return [cmap(color_index) for color_index in range(n)]




[docs]
def plot_df_correlations(
        df,
        figure_base_name='base',
        fontsize=16,
        replace_col_names=None
):
    """ Creates a heatmap plot of Spearman correlations between the numeric columns of a dataframe

    Only the lower triangle (without diagonal) is drawn. The image is written to
    ``figure_base_name + '-correlations.png'`` (400 dpi, tight bounding box).

    :param pandas.DataFrame df: Dataframe to be plotted; non-numeric columns are ignored
    :param str figure_base_name: prefix (may include a directory) of the figure file name
    :param int fontsize: font size of tick labels and cell annotations
    :param dict replace_col_names: optional mapping handed to df.rename(columns=...) to overwrite column names
    :return: -1 if the columns could not be renamed, otherwise None (the plot is written to disk)
    """

    if replace_col_names:
        try:
            df = df.rename(columns=replace_col_names)
        except Exception as e:
            logging.error("ERROR: The provided argument replace_col_names does not fit the dataframe:\n" + str(e))
            return -1

    plt.figure(figsize=(20, 12))
    ax0 = plt.gca()

    # Restrict to numeric/bool columns explicitly: since pandas 2.0, DataFrame.corr raises on
    # non-numeric (e.g. categorical string) columns instead of silently dropping them.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_mat = numeric_df.corr(method='spearman')
    # hide the upper triangle including the diagonal (redundant information)
    mask = np.triu(np.ones_like(corr_mat, dtype=bool))

    sns.heatmap(corr_mat, mask=mask, cmap='RdBu', ax=ax0, cbar=False,
                annot=True, annot_kws={'size': fontsize},
                vmin=-1, vmax=1)

    # visual corrections and modifications
    ax0.xaxis.tick_bottom()
    ax0.set_xticklabels(
        ax0.xaxis.get_majorticklabels(), rotation=45, ha='right', fontsize=fontsize
    )
    ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(), rotation=0, fontsize=fontsize)
    ax0.xaxis.set_ticks_position('none')
    ax0.yaxis.set_ticks_position('none')
    ax0.patch.set_visible(False)

    # rewrite the cell annotations: flag values close to (but not exactly) +/-1
    for text in ax0.texts:
        t = float(text.get_text())
        if 0.95 <= t < 1:
            text.set_text('<1')
        elif -1 < t <= -0.95:
            text.set_text('>-1')
        elif t == 1:
            text.set_text('1')
        elif t == -1:
            text.set_text('-1')
        else:
            text.set_text(round(t, 2))

    ax0.get_figure().savefig(figure_base_name + '-correlations.png', bbox_inches='tight', dpi=400)
    plt.close()
    logging.info("successfully saved dataframe correlation plot as " + figure_base_name)