Source code for cellmaps_generate_hierarchy.maturehierarchy


import os
from datetime import date
import pandas as pd

import networkx as nx
import logging
import cellmaps_generate_hierarchy
from cellmaps_utils import constants
from cellmaps_utils.provenance import ProvenanceUtil

logger = logging.getLogger(__name__)



[docs]
class HiDeFHierarchyRefiner(object):
    """
    Refines HiDeF hierarchy output by removing highly similar terms.
    This code derived from maturehierarchy.py developed by (## todo Leah who created this?)

    """

    TERMS_COL = 'terms'
    TSIZE_COL = 'tsize'
    GENES_COL = 'genes'
    STABILITY_COL = 'stability'

    TYPE_COL = 'type'
    PARENT_COL = 'parent'
    CHILD_COL = 'child'

    GENE_TYPE = 'gene'
    DEFAULT_TYPE = 'default'

    NODES_SUFFIX = '.nodes'
    EDGES_SUFFIX = '.edges'

    CONTAINMENT_THRESHOLD = 0.75
    JACCARD_THRESHOLD = 0.9
    MIN_DIFF = 1
    MIN_SYSTEM_SIZE = 4

    def __init__(self,
                 ci_thre=CONTAINMENT_THRESHOLD,
                 ji_thre=JACCARD_THRESHOLD,
                 min_term_size=MIN_SYSTEM_SIZE,
                 min_diff=MIN_DIFF,
                 provenance_utils=ProvenanceUtil(),
                 author='cellmaps_generate_hierarchy',
                 version=cellmaps_generate_hierarchy.__version__):
        """
        Constructor

        :param ci_thre: Containment index threshold
        :param ji_thre: Jaccard index threshold for merging similar clusters
        :param min_system_size: Minimum number of proteins requiring each system to have
        :param min_diff: Minimum difference in number of proteins for every parent-child pair
        """
        self._ci_thre = ci_thre
        self._ji_thre = ji_thre
        self._min_term_size = min_term_size
        self._min_diff = min_diff
        self._provenance_utils = provenance_utils
        self._author = author
        self._version = version

    @staticmethod
    def _get_node_table_from_hidef(nodes_file):
        """
        Takes a HiDeF nodes file and creates
        a :py:class:`pandas.DataFrame`

        **Example HiDeF edges file:**

        .. code-block::

            Cluster1-408    4       1528 2310 2666 4273     10
            Cluster1-409    4       126 3366 3794 446       10
            Cluster1-410    4       1522 2733 4334 4542     10

        **Resulting DataFrame:**

        .. code-block::

            terms           tsize   genes                   stability
            Cluster1-408    4       1528 2310 2666 4273     10
            Cluster1-409    4       126 3366 3794 446       10
            Cluster1-410    4       1522 2733 4334 4542     10

        :param nodes_file:
        :type nodes_file: str
        :return: DataFrame of the HiDeF nodes file
        :rtype: :py:class:`pandas.DataFrame`
        """
        node_table = pd.read_csv(nodes_file, header=None, sep='\t')
        logger.debug('nodes table head: ' + str(node_table.head()))
        logger.debug('Size of node table: ' + str(len(node_table)))

        node_table.columns = [HiDeFHierarchyRefiner.TERMS_COL,
                              HiDeFHierarchyRefiner.TSIZE_COL,
                              HiDeFHierarchyRefiner.GENES_COL,
                              HiDeFHierarchyRefiner.STABILITY_COL]
        return node_table

    @staticmethod
    def _get_edge_table_from_hidef(edges_file):
        """
        Takes a HiDeF edges file and creates
        a :py:class:`pandas.DataFrame`

        **Example HiDeF edges file:**

        .. code-block::

            Cluster0-0      Cluster1-0      default
            Cluster0-0      Cluster1-1      default
            Cluster0-0      Cluster1-2      default

        **Resulting DataFrame:**

        .. code-block::

            parent          child           type
            Cluster0-0      Cluster1-0      default
            Cluster0-0      Cluster1-1      default
            Cluster0-0      Cluster1-2      default


        :param edges_file:
        :type edges_file: str
        :return:
        :rtype: :py:class:`pandas.DataFrame`
        """
        edge_table = pd.read_csv(edges_file, header=None, sep='\t')
        edge_table.columns = [HiDeFHierarchyRefiner.PARENT_COL,
                              HiDeFHierarchyRefiner.CHILD_COL,
                              HiDeFHierarchyRefiner.TYPE_COL]
        return edge_table

    @staticmethod
    def _get_node_table_filtered_by_term_size(node_table, min_term_size=4):
        """
        Gets a new :py:class:`pandas.DataFrame` derived from
        **node_table** that only contains rows whose :py:const:`TSIZE_COL`
        is at least **min_term_size**

        :param min_term_size: Minimum size of term (ie number of genes)
        :type min_term_size: int
        :return: Filtered **node_table**
        :rtype: :py:class:`pandas.DataFrame`
        """
        filtered_table = node_table[node_table[HiDeFHierarchyRefiner.TSIZE_COL] >= min_term_size]
        logger.debug('Size node node table after term size filter: ' + str(len(filtered_table)))
        return filtered_table

    @staticmethod
    def _get_edge_table_filtered_by_node_set(edge_table, node_set=None):
        """
        Gets a :py:class:`pandas.DataFrame` derived from **edge_table** that only
        contains rows whose parent and child both exist in **node_set** passed in

        :param edge_table: table with :py:const:`PARENT_COL` and :py:const:`CHILD_COL`
                           columns
        :type edge_table: :py:class:`pandas.DataFrame`
        :param node_set: Names of clusters to keep
        :type node_set: set or list
        :return: edge table with rows whose parent and child both exist in **node_set** passed in
        :rtype: :py:class:`pandas.DataFrame`
        """
        return edge_table[edge_table[[HiDeFHierarchyRefiner.PARENT_COL,
                                      HiDeFHierarchyRefiner.CHILD_COL]].isin(list(node_set)).all(axis=1)]

    @staticmethod
    def _get_leaves_from_edge_table(edge_table=None):
        """
        Gets a set of term names found in :py:const:`CHILD_COL` column values
        that are **NOT** in :py:const:`PARENT_COL` column values

        :param edge_table:
        :type edge_table: :py:class:`pandas.DataFrame`
        :return: leave terms
        :rtype: set
        """
        return (set(edge_table[HiDeFHierarchyRefiner.CHILD_COL].unique())) - set(edge_table[HiDeFHierarchyRefiner.PARENT_COL].unique())

    def _get_leaf_genes_to_add(self, node_table=None, leaves=None):
        """
        For each term in **leaves** list get genes for that term and
        add a list with these elements ``[TERM, GENE NAME, GENE_TYPE]``
        to the list returned

        :param node_table:
        :param leaves: term name
        :type leaves: list
        :return: list of ``[TERM, GENE NAME, GENE_TYPE]`` for all the genes found
                in the terms in **leaves** list
        :rtype: list
        """
        add_gene = []
        for leaf in leaves:
            genes = self._get_genes_from_node_table_for_term(node_table, leaf)
            for gene in genes:
                add_gene.append([leaf, gene, HiDeFHierarchyRefiner.GENE_TYPE])
        return add_gene

    def _get_nonleaf_genes_to_add(self, node_table=None,
                                  edge_table=None):
        """
        # TODO: Test this
        Returns a list of ``[TERM, GENE NAME, GENE_TYPE]`` that
        correspond to genes **NOT** passed to leaf nodes

        :param node_table: HiDeF nodes table
        :type node_table: :py:class:`pandas.DataFrame`
        :param edge_table: HiDeF edge table
        :type edge_table: :py:class:`pandas.DataFrame`
        :return: list of ``[TERM, GENE NAME, GENE_TYPE]`` for all the genes found
                 in non-leaf nodes
        :rtype: list
        """
        gene_rows_to_add = []
        parent_to_child = edge_table.groupby(HiDeFHierarchyRefiner.PARENT_COL)[HiDeFHierarchyRefiner.CHILD_COL].apply(list)  # group the parents
        for parent, children in parent_to_child.items():
            parent_genes = self._get_genes_from_node_table_for_term(node_table, parent)
            child_genes = []
            for child in children:
                genes = self._get_genes_from_node_table_for_term(node_table, child)
                child_genes = set(child_genes).union(set(genes))
            # genes only in parent did not pass to child
            only_parent = set(parent_genes) - set(child_genes)
            if len(only_parent) >= 1:
                for gene in only_parent:
                    gene_rows_to_add.append([parent, gene, HiDeFHierarchyRefiner.GENE_TYPE])
        return gene_rows_to_add

    def _create_ontology(self, node_table=None, edge_table=None, min_term_size=4):
        """
        # TODO: Test this
        Creates ontology from HiDeF nodes and edges file

        :param path: Prefix path to nodes & edges files generated by HiDeF,
                     .nodes and .edges will be appended to this path
        :type path: str
        :param min_term_size:
        :type min_term_size: int
        :return:
        :rtype: py:class:`pandas.DataFrame`
        """
        # filter out rows below minTermSize
        node_table_filtered = self._get_node_table_filtered_by_term_size(node_table,
                                                                         min_term_size=min_term_size)

        logger.info(str(len(node_table_filtered)))
        node_set = set(node_table_filtered[HiDeFHierarchyRefiner.TERMS_COL])  # get the set of nodes

        # keep only edges that have entries in node_set
        edge_table_filtered = self._get_edge_table_filtered_by_node_set(edge_table, node_set=node_set)

        # Find leaves and get their genes
        leaves = self._get_leaves_from_edge_table(edge_table=edge_table_filtered)
        gene_rows_to_add = self._get_leaf_genes_to_add(node_table=node_table_filtered, leaves=leaves)

        # Get non leaf genes
        gene_rows_to_add.extend(self._get_nonleaf_genes_to_add(node_table=node_table_filtered,
                                                               edge_table=edge_table_filtered))

        # add gene rows to new dataframe and append to final dataframe
        add_rows = pd.DataFrame(gene_rows_to_add, columns=[HiDeFHierarchyRefiner.PARENT_COL,
                                                           HiDeFHierarchyRefiner.CHILD_COL,
                                                           HiDeFHierarchyRefiner.TYPE_COL])
        final_df = pd.concat([edge_table_filtered, add_rows])
        logger.debug(' Size of final table: ' + str(len(final_df)))
        return final_df

    @staticmethod
    def _get_genes_from_node_table_for_term(nodes_table, term):
        """

        Iterates over genes found under the :py:class:`GENES_COL` column
        splitting by space to get a gene list

        It is assumed the **nodes_table** has this format:

        .. code-block::

            terms           tsize   genes                   stability
            Cluster1-408    4       1528 2310 2666 4273     10
            Cluster1-409    4       126 3366 3794 446       10
            Cluster1-410    4       1522 2733 4334 4542     10


        :param nodes_table: Nodes table from HiDeF output
        :type nodes_table: :py:class:`pandas.DataFrame`
        :param term: term name used to find row to get genes from
        :type term: str
        :return: genes
        :rtype: list
        """
        genes = nodes_table.loc[nodes_table.terms == term][HiDeFHierarchyRefiner.GENES_COL]
        gene_list = []
        for g in genes:
            gene_list.extend(g.split(' '))
        return gene_list

    @staticmethod
    def _to_pandas_dataframe(nx_graph):
        """
        # TODO: Finish implementing this
        Converts **nx_graph** to a Pandas DataFrame
        with columns set to ``source, target, type``

        :param nx_graph:
        :return:
        :rtype: :py:class:`pandas.DataFrame`
        """
        e = nx_graph.edges(data=True)
        df = pd.DataFrame()
        df['source'] = [x[0] for x in e]
        df['target'] = [x[1] for x in e]
        df['type'] = [x[2]['type'] for x in e]
        return df

    @staticmethod
    def _get_term_stats(nx_graph, hiergeneset):
        # TODO: Finish implementing this
        clusters = list(set(list(nx_graph.nodes())) - hiergeneset)
        tsize_list = []
        cgene_list = []
        descendent_list = []
        for c in clusters:
            infoset = nx.descendants(nx_graph, c)
            cgeneset = infoset.intersection(hiergeneset)
            tsize_list.append(len(cgeneset))
            cgene_list.append(list(cgeneset))
            descendent_list.append(list(infoset - cgeneset))
        df = pd.DataFrame(index=clusters)
        df['tsize'] = tsize_list
        df['genes'] = cgene_list
        df['descendent'] = descendent_list
        return df

    @staticmethod
    def _jaccard(a, b):
        """
        Calculates Jaccard index of **a** and **b**

        :param a: set a
        :type a: list or set
        :param b: set b
        :type b: list or set
        :return: Jaccard index
        :rtype: float
        """
        if type(a) != set:
            a = set(a)
        if type(b) != set:
            b = set(b)
        return len(a.intersection(b)) / len(a.union(b))

    def _clean_shortcut(self, nx_graph):
        # TODO: Finish implementing this
        edge_df = self._to_pandas_dataframe(nx_graph)
        edge_df.columns = [HiDeFHierarchyRefiner.PARENT_COL,
                           HiDeFHierarchyRefiner.CHILD_COL,
                           HiDeFHierarchyRefiner.TYPE_COL]
        for idx, row in edge_df.iterrows():
            if len(list(nx.all_simple_paths(nx_graph, row[HiDeFHierarchyRefiner.PARENT_COL],
                                            row[HiDeFHierarchyRefiner.CHILD_COL]))) > 1:
                nx_graph.remove_edge(row[HiDeFHierarchyRefiner.PARENT_COL],
                                     row[HiDeFHierarchyRefiner.CHILD_COL])
                logger.debug('shortcut edges is removed between {} and {}'.format(row[HiDeFHierarchyRefiner.PARENT_COL],
                                                                                  row[HiDeFHierarchyRefiner.CHILD_COL]))

    def _reorganize(self, nx_graph, hiergeneset, ci_thre): # Add an edge if the nodes have containment index >=threshold
        # TODO: Finish implementing this
        iterate = True
        n_iter = 1
        while iterate:
            clear = True
            logger.debug('... starting iteration ' + str(n_iter))
            ts_df = self._get_term_stats(nx_graph, hiergeneset) # get the termStats from the networkx
            ts_df.sort_values('tsize', ascending=False, inplace=True)
            for comp, row in ts_df.iterrows():
                tmp = ts_df[ts_df['tsize'] < row['tsize']] # get all components smaller than this components
                if tmp.shape[0] == 0:
                    continue
                comp_geneset = set(row['genes']) # get the set of genes
                descendent = row['descendent'] # get the list of descendent nodes
                for tmp_comp, tmp_row in tmp.iterrows():
                    if tmp_comp in descendent: # skip if already in descendent
                        continue
                    tmp_comp_geneset = set(tmp_row['genes'])
                    # Check if satisfy ci_thre
                    # intersection of two components divided by the term size of the smaller component
                    if len(comp_geneset.intersection(tmp_comp_geneset))/tmp_row['tsize'] >= ci_thre:
                        # Check if child having higher weight than parent
                        logger.debug('{} is contained in {} with a CI bigger than threshold, add edge between'.format(tmp_comp, comp))
                        nx_graph.add_edge(comp, tmp_comp, type='default')
                        clear = False
                        descendent += tmp_row['descendent']
            # Further clean up using networkx to remove shortcut edges
            self._clean_shortcut(nx_graph)
            # Update variables
            n_iter += 1
            if clear:
                iterate = False
        if n_iter == 2:
            modified = False
        else:
            modified = True
        return modified

    def _merge_parent_child(self, nx_graph, hiergeneset, ji_thre):
        # TODO: Finish implementing this
        # Delete child term if highly similar with parent term
        # One parent-child relationship at a time to avoid complicacies involved in potential long tail
        logger.debug('... start removing highly similar parent-child relationship')
        similar = True
        merged = False
        while similar:
            clear = True
            edge_df = self._to_pandas_dataframe(nx_graph)
            ts_df = self._get_term_stats(nx_graph, hiergeneset)
            default_edge = edge_df[edge_df['type'] == 'default'] # edges
            for idx, row in default_edge.iterrows():
                if self._jaccard(ts_df.loc[row['source']]['genes'], ts_df.loc[row['target']]['genes']) >= ji_thre:
                    logger.debug('# Cluster pair {}->{} failed Jaccard, removing cluster {}'.format(row['source'], row['target'],
                                                                                             row['target']))
                    clear = False
                    merged = True
                    parents = edge_df[edge_df['target'] == row['target']]['source'].values
                    children = edge_df[edge_df['source'] == row['target']]['target'].values
                    # Remove all parent->node edges
                    for pnode in parents:
                        nx_graph.remove_edge(pnode, row['target'])
                    for child_node in children:
                        etype = nx_graph[row['target']][child_node]['type']
                        # Remove all node->child edges
                        nx_graph.remove_edge(row['target'], child_node)
                        # Add all parent->child edges
                        for pnode in parents:
                            nx_graph.add_edge(pnode, child_node, type=etype)
                    # Remove target node
                    nx_graph.remove_node(row['target'])
                    break
            if clear:
                similar = False
        # Clean up shortcuts introduced during node deleteing process
        self._clean_shortcut(nx_graph)
        return merged

    def _collapse_redundant(self, nx_graph, hiergeneset, min_diff):
        # TODO: Finish implementing this
        # Delete child term if highly similar with parent term
        # One parent-child relationship at a time to avoid complicacies involved in potential long tail
        logger.debug('... start removing highly redundant systems')
        while True:
            edge_df = self._to_pandas_dataframe(nx_graph)
            ts_df = self._get_term_stats(nx_graph, hiergeneset)
            default_edge = edge_df[edge_df['type'] == 'default']
            to_collapse = []
            for idx, row in default_edge.iterrows():
                parentSys, childSys, _ = row.values
                if ts_df.loc[parentSys]['tsize'] - ts_df.loc[childSys]['tsize'] < min_diff:
                    to_collapse.append([parentSys, childSys])
            if len(to_collapse) == 0:
                logger.debug('nothing to collapse')
                return
            to_collapse = pd.DataFrame(to_collapse, columns=['parent', 'child'])
            deleteSys = to_collapse.loc['child']
            logger.debug('# Cluster pair {}->{} highly redundant, removing cluster {}'.format(to_collapse.loc['parent'],
                                                                                       to_collapse.loc['child'],
                                                                                       deleteSys))
            parents = edge_df[edge_df['target'] == deleteSys]['source'].values
            children = edge_df[edge_df['source'] == deleteSys]['target'].values
            # Remove all parent->node edges
            for pnode in parents:
                nx_graph.remove_edge(pnode, deleteSys)
            for child_node in children:
                etype = nx_graph[deleteSys][child_node]['type']
                # Remove all node->child edges
                nx_graph.remove_edge(deleteSys, child_node)
                # Add all parent->child edges
                for pnode in parents:
                    nx_graph.add_edge(pnode, child_node, type=etype)
            # Remove target node
            nx_graph.remove_node(deleteSys)

    def _register_pruned_hidef_output_files(self, outprefix):
        """
        Register <outprefix>.nodes and <outprefix.edges> pruned
        HiDeF output files with FAIRSCAPE

        :param outprefix:
        :type outprefix: str
        :return: dataset ids
        :rtype: list
        """
        d_sets = []
        for hidef_file in ['nodes', 'edges']:
            outfile = outprefix + '.' + hidef_file
            data_dict = {'name': os.path.basename(outfile) +
                         ' Pruned HiDeF output ' + hidef_file[0] + ' file',
                         'description': ' Pruned HiDeF output ' + hidef_file[0] + ' file',
                         'data-format': 'tsv',
                         'author': str(self._author),
                         'version': str(self._version),
                         'date-published': date.today().strftime('%m-%d-%Y')}
            d_sets.append(self._provenance_utils.register_dataset(os.path.dirname(outfile),
                                                                  source_file=outfile,
                                                                  data_dict=data_dict))
        return d_sets


[docs]
    def refine_hierarchy(self, outprefix=None):
        """
        Removes highly similar systems and dumps out a new HiDeF formatted
        .nodes and .edges file with .pruned.nodes and .pruned.edges suffixes

        :param outprefix: output_dir/file_prefix for the output file
        :type outprefix: str
        :return: dataset ids of .pruned.nodes and .pruned.edges file generated
        :rtype: list
        """
        logger.debug('Containment index threshold: ' + str(self._ci_thre))
        logger.debug('Jaccard index threshold: ' + str(self._ji_thre))

        # Read node table
        node_table = self._get_node_table_from_hidef(outprefix +
                                                     HiDeFHierarchyRefiner.NODES_SUFFIX)

        # Read edge table
        edge_table = self._get_edge_table_from_hidef(outprefix + HiDeFHierarchyRefiner.EDGES_SUFFIX)

        ont = self._create_ontology(node_table=node_table, edge_table=edge_table,
                                    min_term_size=self._min_term_size)
        hiergeneset = set(ont[ont[HiDeFHierarchyRefiner.TYPE_COL] == HiDeFHierarchyRefiner.GENE_TYPE][HiDeFHierarchyRefiner.CHILD_COL].values)

        nx_graph = nx.from_pandas_edgelist(ont,
                                           source=HiDeFHierarchyRefiner.PARENT_COL,
                                           target=HiDeFHierarchyRefiner.CHILD_COL,
                                           edge_attr=HiDeFHierarchyRefiner.TYPE_COL,
                                           create_using=nx.DiGraph())

        if not nx.is_directed_acyclic_graph(nx_graph):
            raise ValueError('Input hierarchy is not DAG!')

        while True:
            modified = self._reorganize(nx_graph, hiergeneset, self._ci_thre)
            merged = self._merge_parent_child(nx_graph, hiergeneset, self._ji_thre)
            if not modified and not merged:
                break

        self._collapse_redundant(nx_graph, hiergeneset, self._min_diff)
        # Output as ddot edge file
        self._clean_shortcut(nx_graph)

        edge_df = self._to_pandas_dataframe(nx_graph)

        # we need to recreate .nodes file in HiDeF format
        # so create nodes dataframe which has this mapping of
        # all genes for a given cluster (including genes attached to children)
        nodes = self._get_term_stats(nx_graph, hiergeneset)
        logger.debug(nodes.head())

        # get rid of the descendent column
        nodes.drop(['descendent'], axis=1, inplace=True)

        # the genes are in a list, but we need them space delimited so
        # in place replace the 'genes' column
        cleaned = []
        for i, rows in nodes.iterrows():
            if isinstance(rows['genes'], list):
                genes = rows['genes']
            else:
                genes = [g for g in rows['genes'].split(',') if len(g) > 1]
            cleaned.append(' '.join(genes))
        nodes['genes'] = cleaned

        # dont think this is needed
        # node_table.set_index(HiDeFHierarchyRefiner.TERMS_COL)

        # create a map of cluster name to stability values
        cluster_stability = pd.Series(node_table[HiDeFHierarchyRefiner.STABILITY_COL].values,
                                      index=node_table.terms).to_dict()

        # use the cluster_stability map to add stability values to nodes dataframe by
        # matching with the cluster name in the nodes (it is the index)
        nodes[HiDeFHierarchyRefiner.STABILITY_COL] = nodes.index.map(cluster_stability)

        # sort the nodes table by term size
        nodes.sort_values(by='tsize', ascending=False, inplace=True)

        # Todo: Register .pruned.nodes and .pruned.edges with fairscape

        # dump out new nodes file in HiDeF format
        nodes.to_csv(outprefix + '.pruned.nodes', header=False, sep='\t')

        # dump out HiDeF edges file
        edges = edge_df.loc[edge_df['type'] == 'default', :]
        edges.to_csv(outprefix+'.pruned.edges', sep='\t', header=None,index=None)

        logger.debug('Number of edges is ' + str(len(edges)) + ', number of nodes are ' + str(len(nodes)))
        if self._provenance_utils is not None:
            return self._register_pruned_hidef_output_files(outprefix + '.pruned')
        else:
            return list()