# Source code for enadepy.frequent

# The MIT License (MIT)
#
# Copyright (c) 2020 M. Choji
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""A module for frequent itemsets mining."""

import pandas
from mlxtend.frequent_patterns import association_rules, fpgrowth
from typing import Any, List, Set, TypeVar

# Type variable used in the annotations of this module; it is bound to
# pandas DataFrame. The bound is spelled with the public `pandas.DataFrame`
# name rather than the internal `pandas.core.frame` module path.
PandasDataFrame = TypeVar('PandasDataFrame', bound=pandas.DataFrame)


def freq_itemsets(
    dataframe: PandasDataFrame, **kwargs: Any
) -> PandasDataFrame:
    """Generates frequent itemsets from dataframe in transactions mode.

    The itemsets are mined with `mlxtend.frequent_patterns.fpgrowth` and
    a 'length' column holding the size of each itemset is appended to
    the result.

    Note:
        A dataframe in transaction mode is one in which all the columns
        contain binary values, like True or False.

    Args:
        dataframe (PandasDataFrame): A pandas DataFrame in transaction
            mode.
        **kwargs (Any): Any arguments to be passed to function
            `mlxtend.frequent_patterns.fpgrowth`.

    Returns:
        PandasDataFrame: The DataFrame returned by `fpgrowth` (read here
        through its 'itemsets' column) with an extra 'length' column
        giving the number of items in each itemset.
    """
    res = fpgrowth(dataframe, **kwargs)
    # `len` is applied directly; a wrapping lambda adds nothing.
    res['length'] = res['itemsets'].apply(len)

    return res


def _is_closed(itemset: Set, same_sup_itemsets: List[Set]) -> bool:
    """Verifies if a given itemset is a closed frequent itemset.

    Note:
        This function is not supposed to be used externally.

    Args:
        itemset (Set): The itemset to test for closed frequent itemset.
        same_sup_itemsets (List[Set]): A list of itemset with same
            support.

    Returns:
        bool: True if `itemset` is a closed frequent itemset.
    """
    check = [itemset.issubset(x) for x in same_sup_itemsets if x != itemset]
    return not bool(check.count(True))


def closed_freq_itemsets(
    dataframe: PandasDataFrame, **kwargs: Any
) -> PandasDataFrame:
    """Generates frequent itemsets using FP-Growth.

    Generates frequent itemsets as of those generated by
    `mlxtend.frequent_patterns.fpgrowth` but with two additional columns
    indicating if the itemset is a closed frequent itemset and its
    length.

    An itemset is flagged as closed when no other itemset with exactly
    the same support contains it.

    Args:
        dataframe (PandasDataFrame): A pandas DataFrame in transaction
            mode.
        **kwargs (Any): Any arguments to be passed to function
            `mlxtend.frequent_patterns.fpgrowth`.

    Returns:
        PandasDataFrame: The DataFrame returned by `fpgrowth` with two
        extra columns: 'isclosed' (bool) and 'length' (number of items
        in the itemset).

    See Also:
        mlxtend.frequent_patterns.fpgrowth
    """
    freq_isets = fpgrowth(dataframe, **kwargs)
    supports = list(freq_isets['support'])
    itemsets = list(freq_isets['itemsets'])

    # Group the itemsets by support in a single pass instead of
    # re-filtering the whole frame once per distinct support value.
    sup_dict = {}
    for sup, iset in zip(supports, itemsets):
        sup_dict.setdefault(sup, []).append(iset)

    # The column is built from a plain list rather than a row-wise
    # DataFrame.apply: on a frame without rows, apply(axis=1) may hand
    # back a DataFrame, which cannot be assigned to a single column.
    freq_isets['isclosed'] = pandas.Series(
        [
            _is_closed(iset, sup_dict[sup])
            for sup, iset in zip(supports, itemsets)
        ],
        index=freq_isets.index,
        dtype=bool,
    )
    freq_isets['length'] = freq_isets['itemsets'].apply(len)
    return freq_isets


def freq_itemsets_sort(
    dataframe: PandasDataFrame,
    sort_by: str = 'support',
    ascending: bool = False,
    **kwargs: Any
) -> PandasDataFrame:
    """Generates sorted frequent itemsets.

    Same as freq_itemsets but with output sorted.

    Args:
        dataframe (PandasDataFrame): A pandas DataFrame in transaction
            mode.
        sort_by (str, optional): The column to use for sorting
            ('support' or 'length'). Defaults to 'support'.
        ascending (bool, optional): Sort output in ascending mode.
            Defaults to False.
        **kwargs (Any): Any arguments to be passed to function
            `mlxtend.frequent_patterns.fpgrowth`.

    Returns:
        PandasDataFrame: A pandas DataFrame containing the frequent
        itemsets with the support and length for each itemset, ordered
        by the `sort_by` column.

    Raises:
        ValueError: If `sort_by` is neither 'support' nor 'length'.

    See Also:
        freq_itemsets
    """
    # Check the argument before mining, so that a wrong `sort_by` fails
    # immediately instead of after the itemsets have been generated.
    if sort_by not in ('support', 'length'):
        raise ValueError(
            'Argument sort_by should be either "support" or "length"'
        )
    res = freq_itemsets(dataframe, **kwargs)
    res.sort_values(by=sort_by, ascending=ascending, inplace=True)
    return res


def closed_freq_itemsets_sort(
    dataframe: PandasDataFrame,
    sort_by: str = 'support',
    ascending: bool = False,
    **kwargs: Any
) -> PandasDataFrame:
    """Generates sorted frequent itemsets using FP-Growth.

    Same as closed_freq_itemsets but with output sorted.

    Args:
        dataframe (PandasDataFrame): A pandas DataFrame in transaction
            mode.
        sort_by (str, optional): The column to use for sorting
            ('support' or 'length'). Defaults to 'support'.
        ascending (bool, optional): Sort output in ascending mode.
            Defaults to False.
        **kwargs (Any): Any arguments to be passed to function
            `mlxtend.frequent_patterns.fpgrowth`.

    Returns:
        PandasDataFrame: A pandas DataFrame containing the frequent
        itemsets with the corresponding lengths and an indication if an
        itemset is a closed frequent itemset, ordered by the `sort_by`
        column.

    Raises:
        ValueError: If `sort_by` is neither 'support' nor 'length'.

    See Also:
        closed_freq_itemsets
    """
    # Check the argument before mining, so that a wrong `sort_by` fails
    # immediately instead of after the itemsets have been generated.
    if sort_by not in ('support', 'length'):
        raise ValueError(
            'Argument sort_by should be either "support" or "length"'
        )
    res = closed_freq_itemsets(dataframe, **kwargs)
    res.sort_values(by=sort_by, ascending=ascending, inplace=True)
    return res


def find_itemsets_all(
    freq_itemsets: PandasDataFrame,
    search: Set = set(),
    exact: bool = False,
    col_name: str = 'itemsets',
) -> PandasDataFrame:
    """Finds itemsets containing all the items given in query.

    Args:
        freq_itemsets (PandasDataFrame): The frequent itemsets where
            the search will be performed.
        search (Set, optional): Set with items to search for.
            Defaults to set(), which every itemset contains. The
            argument is only read, never modified.
        exact (bool, optional): Match only if itemset is equal to
            `search`. Defaults to False.
        col_name (str, optional): Column name where the itemsets reside.
            Defaults to 'itemsets'.

    Returns:
        PandasDataFrame: a pandas DataFrame containing the rows of
        `freq_itemsets` whose itemset matches the requisites, in their
        original order.

    See Also:
        find_itemsets_any, find_itemsets_without
    """
    column = freq_itemsets[col_name]
    if exact:
        # A plain equality mask is used instead of a query string built
        # from `col_name`, so column names that are not valid Python
        # identifiers (e.g. containing spaces) are handled as well.
        matches = column.apply(lambda x: x == search)
    else:
        matches = column.apply(lambda x: x.issuperset(search))

    # The cast keeps the mask boolean even when the frame has no rows.
    return freq_itemsets.loc[matches.astype(bool), :]


def find_itemsets_any(
    freq_itemsets: PandasDataFrame,
    search: Set = set(),
    col_name: str = 'itemsets',
) -> PandasDataFrame:
    """Finds itemsets containing any of the items given in query.

    Args:
        freq_itemsets (PandasDataFrame): The frequent itemsets where
            the search will be performed.
        search (Set, optional): Set with items to search for.
            Defaults to set(), in which case nothing matches and an
            empty DataFrame is returned. The argument is only read,
            never modified.
        col_name (str, optional): Column name where the itemsets reside.
            Defaults to 'itemsets'.

    Returns:
        PandasDataFrame: a pandas DataFrame containing the rows of
        `freq_itemsets` whose itemset shares at least one item with
        `search`. Each matching row appears once, in its original
        order.

    See Also:
        find_itemsets_all, find_itemsets_without
    """
    wanted = set(search)
    # One membership mask over the column replaces one lookup per item
    # followed by a concatenation: the concatenation fails when `search`
    # is empty and makes the row order depend on set iteration order.
    matches = freq_itemsets[col_name].apply(
        lambda x: not wanted.isdisjoint(x)
    )
    # The cast keeps the mask boolean even when the frame has no rows.
    return freq_itemsets.loc[matches.astype(bool), :]


def find_itemsets_without(
    freq_itemsets: PandasDataFrame,
    search: Set = set(),
    col_name: str = 'itemsets'
) -> PandasDataFrame:
    """Finds itemsets that do not contain the items given in query.

    Args:
        freq_itemsets (PandasDataFrame): The frequent itemsets where
            the search will be performed.
        search (Set, optional): Set with items to exclude.
            Defaults to set(), in which case every row is kept. The
            argument is only read, never modified.
        col_name (str, optional): Column name where the itemsets reside.
            Defaults to 'itemsets'.

    Returns:
        PandasDataFrame: a pandas DataFrame containing the rows of
        `freq_itemsets` whose itemset has no item in common with
        `search`, in their original order.

    See Also:
        find_itemsets_any, find_itemsets_all
    """
    unwanted = set(search)
    # Rows are selected with a boolean mask rather than dropped by index
    # label, so an empty `search` is accepted and a non-unique index
    # cannot cause unrelated rows to be removed.
    keep = freq_itemsets[col_name].apply(
        lambda x: unwanted.isdisjoint(x)
    )
    # The cast keeps the mask boolean even when the frame has no rows.
    return freq_itemsets.loc[keep.astype(bool), :]


def association_rules_ext(
    freq_itemsets: PandasDataFrame, **kwargs: Any
) -> PandasDataFrame:
    """Generates association rules from frequent itemsets.

    This function extends the function
    `mlxtend.frequent_patterns.association_rules` by appending
    information about the length of both antecedent and consequent.
    If the frequent itemsets have indications of closed frequent
    itemsets, the output will also set this information for the
    components of the rule.

    Args:
        freq_itemsets (PandasDataFrame): A pandas DataFrame containing
            frequent itemsets.
        **kwargs (Any): Any arguments to be passed to function
            `mlxtend.frequent_patterns.association_rules`.

    Returns:
        PandasDataFrame: The DataFrame of association rules returned by
        `association_rules` with the extra columns 'A_length' and
        'C_length' (sizes of antecedent and consequent). When
        `freq_itemsets` has an 'isclosed' column, 'A_isclosed' and
        'C_isclosed' are added as well; a rule component that is not
        listed in `freq_itemsets` gets NaN there.

    See Also:
        freq_itemsets: generates frequent itemsets

        mlxtend.frequent_patterns.association_rules
    """
    rules = association_rules(freq_itemsets, **kwargs)
    rules['A_length'] = rules['antecedents'].apply(len)
    rules['C_length'] = rules['consequents'].apply(len)
    if 'isclosed' in freq_itemsets.columns:
        # A direct itemset -> flag lookup replaces two merges whose
        # results were lined up with `rules` by position only, which
        # goes wrong as soon as an itemset is listed more than once.
        closed_by_itemset = dict(
            zip(freq_itemsets['itemsets'], freq_itemsets['isclosed'])
        )
        missing = float('nan')
        rules['A_isclosed'] = rules['antecedents'].map(
            lambda iset: closed_by_itemset.get(iset, missing)
        )
        rules['C_isclosed'] = rules['consequents'].map(
            lambda iset: closed_by_itemset.get(iset, missing)
        )
    return rules


def filter_rules(
    rules: PandasDataFrame,
    by: List[str] = ['conviction', 'support', 'lift']
) -> PandasDataFrame:
    """Excludes duplicated rules according to a given criteria.

    This function will sort the rules according to the columns specified
    and drop rows that contain the same items, considering the union of
    antecedent and consequent, as of the one with greatest values.

    Args:
        rules (PandasDataFrame): a pandas DataFrame containing
            association rules; its 'antecedents' and 'consequents'
            columns are read. The frame itself is not modified.
        by (List[str], optional): A list containing the precedence of
            columns to be used during rules sorting.
            Defaults to ['conviction', 'support', 'lift']. The list is
            only read, never modified.

    Returns:
        PandasDataFrame: a pandas DataFrame containing filtered rules,
        sorted in descending order of `by`, with the first rule kept
        for each distinct union of antecedent and consequent.

    See Also:
        association_rules_ext, find_itemsets_any, find_itemsets_all
    """
    res = rules.sort_values(by=by, ascending=False)
    # The unions live in a separate Series instead of a temporary
    # 'union' column: this also works for a frame without rows (where a
    # row-wise apply does not yield a column) and leaves untouched any
    # column of that name the caller may already have.
    unions = pandas.Series(
        [
            ante.union(cons)
            for ante, cons in zip(res['antecedents'], res['consequents'])
        ],
        index=res.index,
        dtype=object,
    )
    # Position-based mask, so repeated index labels cannot mix up rows.
    first_of_each = ~unions.duplicated().to_numpy()
    return res.loc[first_of_each, :]