From 0ea8df7682158f82fb463292b08201ef6680993e Mon Sep 17 00:00:00 2001
From: Jeel Gajera <jeelgajera00@gmail.com>
Date: Sat, 21 Oct 2023 11:20:06 +0530
Subject: [PATCH 1/8] Add: FP Growth Algorithm

---
 DIRECTORY.md                  |   1 +
 machine_learning/fp_growth.py | 333 ++++++++++++++++++++++++++++++++++
 2 files changed, 334 insertions(+)
 create mode 100644 machine_learning/fp_growth.py

diff --git a/DIRECTORY.md b/DIRECTORY.md
index b92f8f877e97..df6692fdddae 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -541,6 +541,7 @@
   * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py)
   * Forecasting
     * [Run](machine_learning/forecasting/run.py)
+  * [FP Growth Algorithm](machine_learning/fp_growth.py)
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
diff --git a/machine_learning/fp_growth.py b/machine_learning/fp_growth.py
new file mode 100644
index 000000000000..eaf59693f231
--- /dev/null
+++ b/machine_learning/fp_growth.py
@@ -0,0 +1,333 @@
+"""
+The FP-Growth (Frequent Pattern Growth) algorithm is a widely used
+data mining technique for discovering frequent itemsets in
+large transaction databases.
+It overcomes some of the limitations of traditional methods like
+Apriori by efficiently constructing the FP-Tree
+
+WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html
+Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining
+"""
+
+from typing import Optional
+
+
+class TreeNode:
+    """
+    Initialize a TreeNode.
+
+    Args:
+        name_value (str): The name of the node.
+        num_occur (int): The number of occurrences of the node.
+        parent_node (TreeNode): The parent node.
+
+    Example:
+    >>> parent = TreeNode("Parent", 1, None)
+    >>> child = TreeNode("Child", 2, parent)
+    >>> child.name
+    'Child'
+    >>> child.count
+    2
+    """
+
+    def __init__(
+        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
+    ) -> None:
+        self.name = name_value
+        self.count = num_occur
+        self.node_link = None  # Initialize node_link to None
+        self.parent = parent_node
+        self.children: dict[str, TreeNode] = {}
+
+    def inc(self, num_occur: int) -> None:
+        self.count += num_occur
+
+    def disp(self, ind: int = 1) -> None:
+        print("  " * ind, self.name, " ", self.count)
+        for child in self.children.values():
+            child.disp(ind + 1)
+
+
+def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
+    """
+    Create FP tree
+
+    Args:
+        data_set (list): A list of transactions, where each transaction
+        is a list of items.
+        min_sup (int, optional): The minimum support threshold.
+        Items with support less than this will be pruned. Default is 1.
+
+    Returns:
+        TreeNode: The root of the FP-Tree.
+        dict: The header table.
+
+    Example:
+    >>> data_set = [
+    ...    ['A', 'B', 'C'],
+    ...    ['A', 'C'],
+    ...    ['A', 'B', 'E'],
+    ...    ['A', 'B', 'C', 'E'],
+    ...    ['B', 'E']
+    ... ]
+    >>> min_sup = 2
+    >>> fp_tree, header_table = create_tree(data_set, min_sup)
+
+    >>> sorted(list(header_table.keys()))
+    ['A', 'B', 'C', 'E']
+
+    >>> fp_tree.name
+    'Null Set'
+    >>> sorted(fp_tree.children.keys())
+    ['A', 'B']
+    >>> fp_tree.children['A'].name
+    'A'
+    >>> sorted(fp_tree.children['A'].children.keys())
+    ['B', 'C']
+
+    """
+    header_table: dict = {}
+    for trans in data_set:
+        for item in trans:
+            header_table[item] = header_table.get(item, [0, None])
+            header_table[item][0] += 1
+
+    for k in list(header_table.keys()):
+        if header_table[k][0] < min_sup:
+            del header_table[k]
+
+    freq_item_set = set(header_table.keys())
+
+    if len(freq_item_set) == 0:
+        return TreeNode("Null Set", 1, None), {}
+
+    for k in header_table:
+        header_table[k] = [header_table[k], None]
+
+    fp_tree = TreeNode("Null Set", 1, None)  # Parent is None for the root node
+    for tran_set in data_set:
+        local_d = {}
+        for item in tran_set:
+            if item in freq_item_set:
+                local_d[item] = header_table[item][0]
+        if len(local_d) > 0:
+            sorted_items = sorted(
+                local_d.items(), key=lambda item_info: item_info[1], reverse=True
+            )
+            ordered_items = [item[0] for item in sorted_items]
+            update_tree(ordered_items, fp_tree, header_table, 1)
+
+    return fp_tree, header_table
+
+
+def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int) -> None:
+    """
+    Update the FP-Tree with a transaction.
+
+    Args:
+        items (list): List of items in the transaction.
+        in_tree (TreeNode): The current node in the FP-Tree.
+        header_table (dict): The header table with item information.
+        count (int): The count of the transaction.
+
+    Example:
+    >>> data_set = [
+    ...    ['A', 'B', 'C'],
+    ...    ['A', 'C'],
+    ...    ['A', 'B', 'E'],
+    ...    ['A', 'B', 'C', 'E'],
+    ...    ['B', 'E']
+    ... ]
+    >>> min_sup = 2
+    >>> fp_tree, header_table = create_tree(data_set, min_sup)
+
+    >>> transaction = ['A', 'B', 'E']
+    >>> update_tree(transaction, fp_tree, header_table, 1)
+
+    >>> sorted(fp_tree.children['A'].children['B'].children['E'].children.keys())
+    []
+    >>> fp_tree.children['A'].children['B'].children['E'].count
+    2
+    >>> header_table['E'][1].name
+    'E'
+    """
+    if items[0] in in_tree.children:
+        in_tree.children[items[0]].inc(count)
+    else:
+        in_tree.children[items[0]] = TreeNode(items[0], count, in_tree)
+        if header_table[items[0]][1] is None:
+            header_table[items[0]][1] = in_tree.children[items[0]]
+        else:
+            update_header(header_table[items[0]][1], in_tree.children[items[0]])
+    if len(items) > 1:
+        update_tree(items[1:], in_tree.children[items[0]], header_table, count)
+
+
+def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
+    """
+    Update the header table with a node link.
+
+    Args:
+        node_to_test (TreeNode): The node to be updated in the header table.
+        target_node (TreeNode): The node to link to.
+
+    Example:
+    >>> data_set = [
+    ...    ['A', 'B', 'C'],
+    ...    ['A', 'C'],
+    ...    ['A', 'B', 'E'],
+    ...    ['A', 'B', 'C', 'E'],
+    ...    ['B', 'E']
+    ... ]
+    >>> min_sup = 2
+    >>> fp_tree, header_table = create_tree(data_set, min_sup)
+
+    >>> node1 = TreeNode("A", 3, None)
+    >>> node2 = TreeNode("B", 4, None)
+    >>> node1 = update_header(node1, node2)
+    >>> node1.node_link.name
+    'B'
+    >>> node2.node_link is None
+    True
+    """
+    while node_to_test.node_link is not None:
+        node_to_test = node_to_test.node_link
+    if node_to_test.node_link is None:
+        node_to_test.node_link = TreeNode(target_node.name, target_node.count, node_to_test)
+    # Return the updated node
+    return node_to_test
+
+
+def ascend_tree(leaf_node: TreeNode, prefix_path: list) -> None:
+    """
+    Ascend the FP-Tree from a leaf node to its root,
+    adding item names to the prefix path.
+
+    Args:
+        leaf_node (TreeNode): The leaf node to start ascending from.
+        prefix_path (list): A list to store the item as they are ascended.
+
+    Example:
+    >>> data_set = [
+    ...    ['A', 'B', 'C'],
+    ...    ['A', 'C'],
+    ...    ['A', 'B', 'E'],
+    ...    ['A', 'B', 'C', 'E'],
+    ...    ['B', 'E']
+    ... ]
+    >>> min_sup = 2
+    >>> fp_tree, header_table = create_tree(data_set, min_sup)
+
+    >>> path = []
+    >>> ascend_tree(fp_tree.children['A'], path)
+    >>> path # ascending from a leaf node 'A'
+    ['A']
+    """
+    if leaf_node.parent is not None:
+        prefix_path.append(leaf_node.name)
+        ascend_tree(leaf_node.parent, prefix_path)
+
+
+def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
+    """
+    Find the conditional pattern base for a given base pattern.
+
+    Args:
+        base_pat (frozenset): The base pattern for which to find
+        the conditional pattern base.
+        tree_node (TreeNode): The node in the FP-Tree.
+
+    Example:
+    >>> data_set = [
+    ...    ['A', 'B', 'C'],
+    ...    ['A', 'C'],
+    ...    ['A', 'B', 'E'],
+    ...    ['A', 'B', 'C', 'E'],
+    ...    ['B', 'E']
+    ... ]
+    >>> min_sup = 2
+    >>> fp_tree, header_table = create_tree(data_set, min_sup)
+    >>> base_pattern = frozenset(['A'])
+    >>> cond_pat = find_prefix_path(base_pattern, fp_tree.children['A'])
+    >>> sorted(cond_pat.keys())
+    []
+    """
+    cond_pats: dict = {}
+    while tree_node is not None:
+        prefix_path: list = []
+        ascend_tree(tree_node, prefix_path)
+        if len(prefix_path) > 1:
+            cond_pats[frozenset(prefix_path[1:])] = tree_node.count
+        tree_node = tree_node.node_link
+    return cond_pats
+
+
+def mine_tree(
+    in_tree: TreeNode,
+    header_table: dict,
+    min_sup: int,
+    pre_fix: set,
+    freq_item_list: list,
+) -> None:
+    """
+    Mine the FP-Tree recursively to discover frequent itemsets.
+
+    Args:
+        in_tree (TreeNode): The FP-Tree to mine.
+        header_table (dict): The header table with item information.
+        min_sup (int): The minimum support threshold.
+        pre_fix (set): A set of items as a prefix for the itemsets being mined.
+        freq_item_list (list): A list to store the frequent itemsets.
+
+    Example:
+    >>> data_set = [
+    ...    ['A', 'B', 'C'],
+    ...    ['A', 'C'],
+    ...    ['A', 'B', 'E'],
+    ...    ['A', 'B', 'C', 'E'],
+    ...    ['B', 'E']
+    ... ]
+    >>> min_sup = 2
+    >>> fp_tree, header_table = create_tree(data_set, min_sup)
+
+    >>> frequent_itemsets = []
+    >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets)
+    >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}]
+    >>> all(expected in frequent_itemsets for expected in expe_itm)
+    True
+    """
+    sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
+    big_l = [item[0] for item in sorted_items]
+    for base_pat in big_l:
+        new_freq_set = pre_fix.copy()
+        new_freq_set.add(base_pat)
+        freq_item_list.append(new_freq_set)
+        cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1])
+        my_cond_tree, my_head = create_tree(list(cond_patt_bases.keys()), min_sup)
+        if my_head is not None:
+            # Pass header_table[base_pat][1] as node_to_test to update_header
+            header_table[base_pat][1] = update_header(
+                header_table[base_pat][1], my_cond_tree
+            )
+            mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    data_set: list = [
+        frozenset(["bread", "milk", "cheese"]),
+        frozenset(["bread", "milk"]),
+        frozenset(["bread", "diapers"]),
+        frozenset(["bread", "milk", "diapers"]),
+        frozenset(["milk", "diapers"]),
+        frozenset(["milk", "cheese"]),
+        frozenset(["diapers", "cheese"]),
+        frozenset(["bread", "milk", "cheese", "diapers"]),
+    ]
+    fp_tree, header_table = create_tree(data_set, min_sup=3)
+    freq_items: list = []
+    mine_tree(fp_tree, header_table, 3, set(), freq_items)
+    print(freq_items)

From 71776e78022d6a9699f2ced68bd28d6667e7335e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 21 Oct 2023 05:52:58 +0000
Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/fp_growth.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/machine_learning/fp_growth.py b/machine_learning/fp_growth.py
index eaf59693f231..356e95347079 100644
--- a/machine_learning/fp_growth.py
+++ b/machine_learning/fp_growth.py
@@ -193,7 +193,9 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     while node_to_test.node_link is not None:
         node_to_test = node_to_test.node_link
     if node_to_test.node_link is None:
-        node_to_test.node_link = TreeNode(target_node.name, target_node.count, node_to_test)
+        node_to_test.node_link = TreeNode(
+            target_node.name, target_node.count, node_to_test
+        )
     # Return the updated node
     return node_to_test
 

From c0470094d01391294617df6a92734b78b470b127 Mon Sep 17 00:00:00 2001
From: Jeel Gajera <jeelgajera00@gmail.com>
Date: Sat, 21 Oct 2023 14:48:21 +0530
Subject: [PATCH 3/8] changes names

---
 DIRECTORY.md                                  |  2 +-
 ...p_growth.py => frequent_pattern_growth.py} | 32 +++++++++++--------
 2 files changed, 20 insertions(+), 14 deletions(-)
 rename machine_learning/{fp_growth.py => frequent_pattern_growth.py} (92%)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index df6692fdddae..916d993c563a 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -541,7 +541,7 @@
   * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py)
   * Forecasting
     * [Run](machine_learning/forecasting/run.py)
-  * [FP Growth Algorithm](machine_learning/fp_growth.py)
+  * [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py)
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
diff --git a/machine_learning/fp_growth.py b/machine_learning/frequent_pattern_growth.py
similarity index 92%
rename from machine_learning/fp_growth.py
rename to machine_learning/frequent_pattern_growth.py
index 356e95347079..fa37313be9da 100644
--- a/machine_learning/fp_growth.py
+++ b/machine_learning/frequent_pattern_growth.py
@@ -10,8 +10,10 @@
 """
 
 from typing import Optional
+from dataclasses import dataclass, field
 
 
+@dataclass
 class TreeNode:
     """
     Initialize a TreeNode.
@@ -30,14 +32,19 @@ class TreeNode:
     2
     """
 
-    def __init__(
-        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
-    ) -> None:
-        self.name = name_value
-        self.count = num_occur
-        self.node_link = None  # Initialize node_link to None
-        self.parent = parent_node
-        self.children: dict[str, TreeNode] = {}
+    # def __init__(
+    #     self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
+    # ) -> None:
+    #     self.name = name_value
+    #     self.count = num_occur
+    #     self.node_link = TreeNode | None  # Initialize node_link to None
+    #     self.parent = parent_node
+    #     self.children: dict[str, TreeNode] = {}
+    name: str
+    count: int
+    node_link: Optional['TreeNode'] = None  # Initialize node_link to None
+    parent: Optional["TreeNode"] = None
+    children: dict[str, "TreeNode"] = field(default_factory=dict)
 
     def inc(self, num_occur: int) -> None:
         self.count += num_occur
@@ -50,7 +57,7 @@ def disp(self, ind: int = 1) -> None:
 
 def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
     """
-    Create FP tree
+    Create Frequent Pattern tree
 
     Args:
         data_set (list): A list of transactions, where each transaction
@@ -193,10 +200,7 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     while node_to_test.node_link is not None:
         node_to_test = node_to_test.node_link
     if node_to_test.node_link is None:
-        node_to_test.node_link = TreeNode(
-            target_node.name, target_node.count, node_to_test
-        )
-    # Return the updated node
+        node_to_test.node_link = target_node
     return node_to_test
 
 
@@ -298,6 +302,7 @@ def mine_tree(
     >>> all(expected in frequent_itemsets for expected in expe_itm)
     True
     """
+    new_head: Optional['TreeNode'] = None
     sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
     big_l = [item[0] for item in sorted_items]
     for base_pat in big_l:
@@ -311,6 +316,7 @@ def mine_tree(
             header_table[base_pat][1] = update_header(
                 header_table[base_pat][1], my_cond_tree
             )
+            my_head = new_head
             mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
 
 

From c014b1c006e1a5a7e3ba41856ad736e843df8c34 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 21 Oct 2023 09:19:07 +0000
Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/frequent_pattern_growth.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/frequent_pattern_growth.py
index fa37313be9da..3d5d6609fc53 100644
--- a/machine_learning/frequent_pattern_growth.py
+++ b/machine_learning/frequent_pattern_growth.py
@@ -42,7 +42,7 @@ class TreeNode:
     #     self.children: dict[str, TreeNode] = {}
     name: str
     count: int
-    node_link: Optional['TreeNode'] = None  # Initialize node_link to None
+    node_link: Optional["TreeNode"] = None  # Initialize node_link to None
     parent: Optional["TreeNode"] = None
     children: dict[str, "TreeNode"] = field(default_factory=dict)
 
@@ -302,7 +302,7 @@ def mine_tree(
     >>> all(expected in frequent_itemsets for expected in expe_itm)
     True
     """
-    new_head: Optional['TreeNode'] = None
+    new_head: Optional["TreeNode"] = None
     sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
     big_l = [item[0] for item in sorted_items]
     for base_pat in big_l:

From 3ae692d129e5b60df7baafbd09f2005a4dc305ef Mon Sep 17 00:00:00 2001
From: Jeel Gajera <jeelgajera00@gmail.com>
Date: Sat, 21 Oct 2023 14:53:49 +0530
Subject: [PATCH 5/8] Revert "changes names"

This reverts commit c0470094d01391294617df6a92734b78b470b127.
---
 DIRECTORY.md                                  |  2 +-
 ...requent_pattern_growth.py => fp_growth.py} | 32 ++++++++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)
 rename machine_learning/{frequent_pattern_growth.py => fp_growth.py} (92%)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 916d993c563a..df6692fdddae 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -541,7 +541,7 @@
   * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py)
   * Forecasting
     * [Run](machine_learning/forecasting/run.py)
-  * [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py)
+  * [FP Growth Algorithm](machine_learning/fp_growth.py)
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/fp_growth.py
similarity index 92%
rename from machine_learning/frequent_pattern_growth.py
rename to machine_learning/fp_growth.py
index fa37313be9da..356e95347079 100644
--- a/machine_learning/frequent_pattern_growth.py
+++ b/machine_learning/fp_growth.py
@@ -10,10 +10,8 @@
 """
 
 from typing import Optional
-from dataclasses import dataclass, field
 
 
-@dataclass
 class TreeNode:
     """
     Initialize a TreeNode.
@@ -32,19 +30,14 @@ class TreeNode:
     2
     """
 
-    # def __init__(
-    #     self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
-    # ) -> None:
-    #     self.name = name_value
-    #     self.count = num_occur
-    #     self.node_link = TreeNode | None  # Initialize node_link to None
-    #     self.parent = parent_node
-    #     self.children: dict[str, TreeNode] = {}
-    name: str
-    count: int
-    node_link: Optional['TreeNode'] = None  # Initialize node_link to None
-    parent: Optional["TreeNode"] = None
-    children: dict[str, "TreeNode"] = field(default_factory=dict)
+    def __init__(
+        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
+    ) -> None:
+        self.name = name_value
+        self.count = num_occur
+        self.node_link = None  # Initialize node_link to None
+        self.parent = parent_node
+        self.children: dict[str, TreeNode] = {}
 
     def inc(self, num_occur: int) -> None:
         self.count += num_occur
@@ -57,7 +50,7 @@ def disp(self, ind: int = 1) -> None:
 
 def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
     """
-    Create Frequent Pattern tree
+    Create FP tree
 
     Args:
         data_set (list): A list of transactions, where each transaction
@@ -200,7 +193,10 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     while node_to_test.node_link is not None:
         node_to_test = node_to_test.node_link
     if node_to_test.node_link is None:
-        node_to_test.node_link = target_node
+        node_to_test.node_link = TreeNode(
+            target_node.name, target_node.count, node_to_test
+        )
+    # Return the updated node
     return node_to_test
 
 
@@ -302,7 +298,6 @@ def mine_tree(
     >>> all(expected in frequent_itemsets for expected in expe_itm)
     True
     """
-    new_head: Optional['TreeNode'] = None
     sorted_items = sorted(header_table.items(), key=lambda item_info: item_info[1][0])
     big_l = [item[0] for item in sorted_items]
     for base_pat in big_l:
@@ -316,7 +311,6 @@ def mine_tree(
             header_table[base_pat][1] = update_header(
                 header_table[base_pat][1], my_cond_tree
             )
-            my_head = new_head
             mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
 
 

From 973ae0200870f88ea0b2b17e40abc0890582df57 Mon Sep 17 00:00:00 2001
From: Jeel Gajera <jeelgajera00@gmail.com>
Date: Sat, 21 Oct 2023 15:59:24 +0530
Subject: [PATCH 6/8] refactor code

---
 DIRECTORY.md                                  |  2 +-
 ...p_growth.py => frequent_pattern_growth.py} | 23 +++++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)
 rename machine_learning/{fp_growth.py => frequent_pattern_growth.py} (95%)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index df6692fdddae..916d993c563a 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -541,7 +541,7 @@
   * [Dimensionality Reduction](machine_learning/dimensionality_reduction.py)
   * Forecasting
     * [Run](machine_learning/forecasting/run.py)
-  * [FP Growth Algorithm](machine_learning/fp_growth.py)
+  * [Frequent Pattern Growth Algorithm](machine_learning/frequent_pattern_growth.py)
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
diff --git a/machine_learning/fp_growth.py b/machine_learning/frequent_pattern_growth.py
similarity index 95%
rename from machine_learning/fp_growth.py
rename to machine_learning/frequent_pattern_growth.py
index 356e95347079..df37eeee2f08 100644
--- a/machine_learning/fp_growth.py
+++ b/machine_learning/frequent_pattern_growth.py
@@ -9,9 +9,10 @@
 Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining
 """
 
+# from dataclasses import dataclass, field
 from typing import Optional
 
-
+# @dataclass
 class TreeNode:
     """
     Initialize a TreeNode.
@@ -31,7 +32,8 @@ class TreeNode:
     """
 
     def __init__(
-        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
+        self, name_value: str, num_occur: int, 
+    parent_node: Optional["TreeNode"] = None
     ) -> None:
         self.name = name_value
         self.count = num_occur
@@ -39,6 +41,13 @@ def __init__(
         self.parent = parent_node
         self.children: dict[str, TreeNode] = {}
 
+    # name: str
+    # count: int
+    # node_link: Optional["TreeNode"] = None
+    # parent: Optional["TreeNode"] = None
+    # children: dict[str, "TreeNode"] = field(default_factory=dict)
+
+
     def inc(self, num_occur: int) -> None:
         self.count += num_occur
 
@@ -50,7 +59,7 @@ def disp(self, ind: int = 1) -> None:
 
 def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
     """
-    Create FP tree
+    Create Frequent Pattern tree
 
     Args:
         data_set (list): A list of transactions, where each transaction
@@ -96,9 +105,7 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
         if header_table[k][0] < min_sup:
             del header_table[k]
 
-    freq_item_set = set(header_table.keys())
-
-    if len(freq_item_set) == 0:
+    if not (freq_item_set := set(header_table)):
         return TreeNode("Null Set", 1, None), {}
 
     for k in header_table:
@@ -193,9 +200,7 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     while node_to_test.node_link is not None:
         node_to_test = node_to_test.node_link
     if node_to_test.node_link is None:
-        node_to_test.node_link = TreeNode(
-            target_node.name, target_node.count, node_to_test
-        )
+        node_to_test.node_link = target_node
     # Return the updated node
     return node_to_test
 

From 8a1f71b812b678097cb6e962c01aa04cd6c53ee4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 21 Oct 2023 10:30:26 +0000
Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/frequent_pattern_growth.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/frequent_pattern_growth.py
index df37eeee2f08..b58ef5d45746 100644
--- a/machine_learning/frequent_pattern_growth.py
+++ b/machine_learning/frequent_pattern_growth.py
@@ -12,6 +12,7 @@
 # from dataclasses import dataclass, field
 from typing import Optional
 
+
 # @dataclass
 class TreeNode:
     """
@@ -32,8 +33,7 @@ class TreeNode:
     """
 
     def __init__(
-        self, name_value: str, num_occur: int, 
-    parent_node: Optional["TreeNode"] = None
+        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
     ) -> None:
         self.name = name_value
         self.count = num_occur
@@ -47,7 +47,6 @@ def __init__(
     # parent: Optional["TreeNode"] = None
     # children: dict[str, "TreeNode"] = field(default_factory=dict)
 
-
     def inc(self, num_occur: int) -> None:
         self.count += num_occur
 

From d4ded62a0d217749b430ebac926ba1a4dc1a81ec Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Sat, 21 Oct 2023 16:47:43 +0200
Subject: [PATCH 8/8] Update frequent_pattern_growth.py

---
 machine_learning/frequent_pattern_growth.py | 168 +++++++++++---------
 1 file changed, 89 insertions(+), 79 deletions(-)

diff --git a/machine_learning/frequent_pattern_growth.py b/machine_learning/frequent_pattern_growth.py
index b58ef5d45746..205d598464a1 100644
--- a/machine_learning/frequent_pattern_growth.py
+++ b/machine_learning/frequent_pattern_growth.py
@@ -1,27 +1,28 @@
 """
-The FP-Growth (Frequent Pattern Growth) algorithm is a widely used
-data mining technique for discovering frequent itemsets in
-large transaction databases.
-It overcomes some of the limitations of traditional methods like
-Apriori by efficiently constructing the FP-Tree
+The Frequent Pattern Growth algorithm (FP-Growth) is a widely used data mining
+technique for discovering frequent itemsets in large transaction databases.
+
+It overcomes some of the limitations of traditional methods such as Apriori by
+efficiently constructing the FP-Tree.
 
 WIKI: https://athena.ecs.csus.edu/~mei/associationcw/FpGrowth.html
+
 Examples: https://www.javatpoint.com/fp-growth-algorithm-in-data-mining
 """
+from __future__ import annotations
 
-# from dataclasses import dataclass, field
-from typing import Optional
+from dataclasses import dataclass, field
 
 
-# @dataclass
+@dataclass
 class TreeNode:
     """
-    Initialize a TreeNode.
+    A node in a Frequent Pattern tree.
 
     Args:
-        name_value (str): The name of the node.
-        num_occur (int): The number of occurrences of the node.
-        parent_node (TreeNode): The parent node.
+        name: The name of this node.
+        count: The number of occurrences of the node.
+        parent: The parent node.
 
     Example:
     >>> parent = TreeNode("Parent", 1, None)
@@ -32,26 +33,20 @@ class TreeNode:
     2
     """
 
-    def __init__(
-        self, name_value: str, num_occur: int, parent_node: Optional["TreeNode"] = None
-    ) -> None:
-        self.name = name_value
-        self.count = num_occur
-        self.node_link = None  # Initialize node_link to None
-        self.parent = parent_node
-        self.children: dict[str, TreeNode] = {}
-
-    # name: str
-    # count: int
-    # node_link: Optional["TreeNode"] = None
-    # parent: Optional["TreeNode"] = None
-    # children: dict[str, "TreeNode"] = field(default_factory=dict)
+    name: str
+    count: int
+    parent: TreeNode | None = None
+    children: dict[str, TreeNode] = field(default_factory=dict)
+    node_link: TreeNode | None = None
+
+    def __repr__(self) -> str:
+        return f"TreeNode({self.name!r}, {self.count!r}, {self.parent!r})"
 
     def inc(self, num_occur: int) -> None:
         self.count += num_occur
 
     def disp(self, ind: int = 1) -> None:
-        print("  " * ind, self.name, " ", self.count)
+        print(f"{'  ' * ind} {self.name}  {self.count}")
         for child in self.children.values():
             child.disp(ind + 1)
 
@@ -61,14 +56,13 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
     Create Frequent Pattern tree
 
     Args:
-        data_set (list): A list of transactions, where each transaction
-        is a list of items.
-        min_sup (int, optional): The minimum support threshold.
+        data_set: A list of transactions, where each transaction is a list of items.
+        min_sup: The minimum support threshold.
         Items with support less than this will be pruned. Default is 1.
 
     Returns:
-        TreeNode: The root of the FP-Tree.
-        dict: The header table.
+        The root of the FP-Tree.
+        header_table: The header table dictionary with item information.
 
     Example:
     >>> data_set = [
@@ -80,19 +74,24 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
     ... ]
     >>> min_sup = 2
     >>> fp_tree, header_table = create_tree(data_set, min_sup)
-
-    >>> sorted(list(header_table.keys()))
+    >>> fp_tree
+    TreeNode('Null Set', 1, None)
+    >>> len(header_table)
+    4
+    >>> header_table["A"]
+    [[4, None], TreeNode('A', 4, TreeNode('Null Set', 1, None))]
+    >>> header_table["E"][1]  # doctest: +NORMALIZE_WHITESPACE
+    TreeNode('E', 1, TreeNode('B', 3, TreeNode('A', 4, TreeNode('Null Set', 1, None))))
+    >>> sorted(header_table)
     ['A', 'B', 'C', 'E']
-
     >>> fp_tree.name
     'Null Set'
-    >>> sorted(fp_tree.children.keys())
+    >>> sorted(fp_tree.children)
     ['A', 'B']
     >>> fp_tree.children['A'].name
     'A'
-    >>> sorted(fp_tree.children['A'].children.keys())
+    >>> sorted(fp_tree.children['A'].children)
     ['B', 'C']
-
     """
     header_table: dict = {}
     for trans in data_set:
@@ -100,7 +99,7 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
             header_table[item] = header_table.get(item, [0, None])
             header_table[item][0] += 1
 
-    for k in list(header_table.keys()):
+    for k in list(header_table):
         if header_table[k][0] < min_sup:
             del header_table[k]
 
@@ -112,11 +111,10 @@ def create_tree(data_set: list, min_sup: int = 1) -> tuple[TreeNode, dict]:
 
     fp_tree = TreeNode("Null Set", 1, None)  # Parent is None for the root node
     for tran_set in data_set:
-        local_d = {}
-        for item in tran_set:
-            if item in freq_item_set:
-                local_d[item] = header_table[item][0]
-        if len(local_d) > 0:
+        local_d = {
+            item: header_table[item][0] for item in tran_set if item in freq_item_set
+        }
+        if local_d:
             sorted_items = sorted(
                 local_d.items(), key=lambda item_info: item_info[1], reverse=True
             )
@@ -131,10 +129,10 @@ def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int)
     Update the FP-Tree with a transaction.
 
     Args:
-        items (list): List of items in the transaction.
-        in_tree (TreeNode): The current node in the FP-Tree.
-        header_table (dict): The header table with item information.
-        count (int): The count of the transaction.
+        items: List of items in the transaction.
+        in_tree: The current node in the FP-Tree.
+        header_table: The header table dictionary with item information.
+        count: The count of the transaction.
 
     Example:
     >>> data_set = [
@@ -146,12 +144,14 @@ def update_tree(items: list, in_tree: TreeNode, header_table: dict, count: int)
     ... ]
     >>> min_sup = 2
     >>> fp_tree, header_table = create_tree(data_set, min_sup)
-
+    >>> fp_tree
+    TreeNode('Null Set', 1, None)
     >>> transaction = ['A', 'B', 'E']
     >>> update_tree(transaction, fp_tree, header_table, 1)
-
-    >>> sorted(fp_tree.children['A'].children['B'].children['E'].children.keys())
-    []
+    >>> fp_tree
+    TreeNode('Null Set', 1, None)
+    >>> fp_tree.children['A'].children['B'].children['E'].children
+    {}
     >>> fp_tree.children['A'].children['B'].children['E'].count
     2
     >>> header_table['E'][1].name
@@ -174,8 +174,8 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     Update the header table with a node link.
 
     Args:
-        node_to_test (TreeNode): The node to be updated in the header table.
-        target_node (TreeNode): The node to link to.
+        node_to_test: The node to be updated in the header table.
+        target_node: The node to link to.
 
     Example:
     >>> data_set = [
@@ -187,12 +187,17 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     ... ]
     >>> min_sup = 2
     >>> fp_tree, header_table = create_tree(data_set, min_sup)
-
+    >>> fp_tree
+    TreeNode('Null Set', 1, None)
     >>> node1 = TreeNode("A", 3, None)
     >>> node2 = TreeNode("B", 4, None)
+    >>> node1
+    TreeNode('A', 3, None)
     >>> node1 = update_header(node1, node2)
-    >>> node1.node_link.name
-    'B'
+    >>> node1
+    TreeNode('A', 3, None)
+    >>> node1.node_link
+    TreeNode('B', 4, None)
     >>> node2.node_link is None
     True
     """
@@ -204,14 +209,14 @@ def update_header(node_to_test: TreeNode, target_node: TreeNode) -> TreeNode:
     return node_to_test
 
 
-def ascend_tree(leaf_node: TreeNode, prefix_path: list) -> None:
+def ascend_tree(leaf_node: TreeNode, prefix_path: list[str]) -> None:
     """
-    Ascend the FP-Tree from a leaf node to its root,
-    adding item names to the prefix path.
+    Ascend the FP-Tree from a leaf node to its root, adding item names to the prefix
+    path.
 
     Args:
-        leaf_node (TreeNode): The leaf node to start ascending from.
-        prefix_path (list): A list to store the item as they are ascended.
+        leaf_node: The leaf node to start ascending from.
+        prefix_path: A list to store the items as they are ascended.
 
     Example:
     >>> data_set = [
@@ -239,9 +244,8 @@ def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
     Find the conditional pattern base for a given base pattern.
 
     Args:
-        base_pat (frozenset): The base pattern for which to find
-        the conditional pattern base.
-        tree_node (TreeNode): The node in the FP-Tree.
+        base_pat: The base pattern for which to find the conditional pattern base.
+        tree_node: The node in the FP-Tree.
 
     Example:
     >>> data_set = [
@@ -253,9 +257,12 @@ def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:
     ... ]
     >>> min_sup = 2
     >>> fp_tree, header_table = create_tree(data_set, min_sup)
+    >>> fp_tree
+    TreeNode('Null Set', 1, None)
+    >>> len(header_table)
+    4
     >>> base_pattern = frozenset(['A'])
-    >>> cond_pat = find_prefix_path(base_pattern, fp_tree.children['A'])
-    >>> sorted(cond_pat.keys())
+    >>> sorted(find_prefix_path(base_pattern, fp_tree.children['A']))
     []
     """
     cond_pats: dict = {}
@@ -279,11 +286,11 @@ def mine_tree(
     Mine the FP-Tree recursively to discover frequent itemsets.
 
     Args:
-        in_tree (TreeNode): The FP-Tree to mine.
-        header_table (dict): The header table with item information.
-        min_sup (int): The minimum support threshold.
-        pre_fix (set): A set of items as a prefix for the itemsets being mined.
-        freq_item_list (list): A list to store the frequent itemsets.
+        in_tree: The FP-Tree to mine.
+        header_table: The header table dictionary with item information.
+        min_sup: The minimum support threshold.
+        pre_fix: A set of items as a prefix for the itemsets being mined.
+        freq_item_list: A list to store the frequent itemsets.
 
     Example:
     >>> data_set = [
@@ -295,7 +302,8 @@ def mine_tree(
     ... ]
     >>> min_sup = 2
     >>> fp_tree, header_table = create_tree(data_set, min_sup)
-
+    >>> fp_tree
+    TreeNode('Null Set', 1, None)
     >>> frequent_itemsets = []
     >>> mine_tree(fp_tree, header_table, min_sup, set([]), frequent_itemsets)
     >>> expe_itm = [{'C'}, {'C', 'A'}, {'E'}, {'A', 'E'}, {'E', 'B'}, {'A'}, {'B'}]
@@ -309,7 +317,7 @@ def mine_tree(
         new_freq_set.add(base_pat)
         freq_item_list.append(new_freq_set)
         cond_patt_bases = find_prefix_path(base_pat, header_table[base_pat][1])
-        my_cond_tree, my_head = create_tree(list(cond_patt_bases.keys()), min_sup)
+        my_cond_tree, my_head = create_tree(list(cond_patt_bases), min_sup)
         if my_head is not None:
             # Pass header_table[base_pat][1] as node_to_test to update_header
             header_table[base_pat][1] = update_header(
@@ -319,11 +327,10 @@ def mine_tree(
 
 
 if __name__ == "__main__":
-    import doctest
-
-    doctest.testmod()
+    from doctest import testmod
 
-    data_set: list = [
+    testmod()
+    data_set: list[frozenset] = [
         frozenset(["bread", "milk", "cheese"]),
         frozenset(["bread", "milk"]),
         frozenset(["bread", "diapers"]),
@@ -333,7 +340,10 @@ def mine_tree(
         frozenset(["diapers", "cheese"]),
         frozenset(["bread", "milk", "cheese", "diapers"]),
     ]
+    print(f"{len(data_set) = }")
     fp_tree, header_table = create_tree(data_set, min_sup=3)
+    print(f"{fp_tree = }")
+    print(f"{len(header_table) = }")
     freq_items: list = []
     mine_tree(fp_tree, header_table, 3, set(), freq_items)
-    print(freq_items)
+    print(f"{freq_items = }")