ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
fp_growth.py
(7664B)
1 from __future__ import division, print_function
2 import numpy as np
3 import itertools
4
5
6 class FPTreeNode():
7 def __init__(self, item=None, support=1):
8 # 'Value' of the item
9 self.item = item
10 # Number of times the item occurs in a
11 # transaction
12 self.support = support
13 # Child nodes in the FP Growth Tree
14 self.children = {}
15
16
17 class FPGrowth():
18 """A method for determining frequent itemsets in a transactional database.
19 This is done by building a so called FP Growth tree, which can then be mined
20 to collect the frequent itemsets. More effective than Apriori for large transactional
21 databases.
22
23 Parameters:
24 -----------
25 min_sup: float
26 The minimum fraction of transactions an itemets needs to
27 occur in to be deemed frequent
28 """
29 def __init__(self, min_sup=0.3):
30 self.min_sup = min_sup
31 # The root of the initial FP Growth Tree
32 self.tree_root = None
33 # Prefixes of itemsets in the FP Growth Tree
34 self.prefixes = {}
35 self.frequent_itemsets = []
36
37 # Count the number of transactions that contains item.
38 def _calculate_support(self, item, transactions):
39 count = 0
40 for transaction in transactions:
41 if item in transaction:
42 count += 1
43 support = count
44 return support
45
46
47 def _get_frequent_items(self, transactions):
48 """ Returns a set of frequent items. An item is determined to
49 be frequent if there are atleast min_sup transactions that contains
50 it. """
51 # Get all unique items in the transactions
52 unique_items = set(
53 item for transaction in transactions for item in transaction)
54 items = []
55 for item in unique_items:
56 sup = self._calculate_support(item, transactions)
57 if sup >= self.min_sup:
58 items.append([item, sup])
59 # Sort by support - Highest to lowest
60 items.sort(key=lambda item: item[1], reverse=True)
61 frequent_items = [[el[0]] for el in items]
62 # Only return the items
63 return frequent_items
64
65 def _insert_tree(self, node, children):
66 """ Recursive method which adds nodes to the tree. """
67 if not children:
68 return
69 # Create new node as the first item in children list
70 child_item = children[0]
71 child = FPTreeNode(item=child_item)
72 # If parent already contains item => increase the support
73 if child_item in node.children:
74 node.children[child.item].support += 1
75 else:
76 node.children[child.item] = child
77
78 # Execute _insert_tree on the rest of the children list
79 # from the new node
80 self._insert_tree(node.children[child.item], children[1:])
81
82 def _construct_tree(self, transactions, frequent_items=None):
83 if not frequent_items:
84 # Get frequent items sorted by support
85 frequent_items = self._get_frequent_items(transactions)
86 unique_frequent_items = list(
87 set(item for itemset in frequent_items for item in itemset))
88 # Construct the root of the FP Growth tree
89 root = FPTreeNode()
90 for transaction in transactions:
91 # Remove items that are not frequent according to
92 # unique_frequent_items
93 transaction = [item for item in transaction if item in unique_frequent_items]
94 transaction.sort(key=lambda item: frequent_items.index([item]))
95 self._insert_tree(root, transaction)
96
97 return root
98
99 def print_tree(self, node=None, indent_times=0):
100 """ Recursive method which prints the FP Growth Tree """
101 if not node:
102 node = self.tree_root
103 indent = " " * indent_times
104 print ("%s%s:%s" % (indent, node.item, node.support))
105 for child_key in node.children:
106 child = node.children[child_key]
107 self.print_tree(child, indent_times + 1)
108
109
110 def _is_prefix(self, itemset, node):
111 """ Makes sure that the first item in itemset is a child of node
112 and that every following item in itemset is reachable via that path """
113 for item in itemset:
114 if not item in node.children:
115 return False
116 node = node.children[item]
117 return True
118
119
120 def _determine_prefixes(self, itemset, node, prefixes=None):
121 """ Recursive method that adds prefixes to the itemset by traversing the
122 FP Growth Tree"""
123 if not prefixes:
124 prefixes = []
125
126 # If the current node is a prefix to the itemset
127 # add the current prefixes value as prefix to the itemset
128 if self._is_prefix(itemset, node):
129 itemset_key = self._get_itemset_key(itemset)
130 if not itemset_key in self.prefixes:
131 self.prefixes[itemset_key] = []
132 self.prefixes[itemset_key] += [{"prefix": prefixes, "support": node.children[itemset[0]].support}]
133
134 for child_key in node.children:
135 child = node.children[child_key]
136 # Recursive call with child as new node. Add the child item as potential
137 # prefix.
138 self._determine_prefixes(itemset, child, prefixes + [child.item])
139
140
141 def _get_itemset_key(self, itemset):
142 """ Determines the look of the hashmap key for self.prefixes
143 List of more strings than one gets joined by '-' """
144 if len(itemset) > 1:
145 itemset_key = "-".join(itemset)
146 else:
147 itemset_key = str(itemset[0])
148 return itemset_key
149
150 def _determine_frequent_itemsets(self, conditional_database, suffix):
151 # Calculate new frequent items from the conditional database
152 # of suffix
153 frequent_items = self._get_frequent_items(conditional_database)
154
155 cond_tree = None
156
157 if suffix:
158 cond_tree = self._construct_tree(conditional_database, frequent_items)
159 # Output new frequent itemset as the suffix added to the frequent
160 # items
161 self.frequent_itemsets += [el + suffix for el in frequent_items]
162
163 # Find larger frequent itemset by finding prefixes
164 # of the frequent items in the FP Growth Tree for the conditional
165 # database.
166 self.prefixes = {}
167 for itemset in frequent_items:
168 # If no suffix (first run)
169 if not cond_tree:
170 cond_tree = self.tree_root
171 # Determine prefixes to itemset
172 self._determine_prefixes(itemset, cond_tree)
173 conditional_database = []
174 itemset_key = self._get_itemset_key(itemset)
175 # Build new conditional database
176 if itemset_key in self.prefixes:
177 for el in self.prefixes[itemset_key]:
178 # If support = 4 => add 4 of the corresponding prefix set
179 for _ in range(el["support"]):
180 conditional_database.append(el["prefix"])
181 # Create new suffix
182 new_suffix = itemset + suffix if suffix else itemset
183 self._determine_frequent_itemsets(conditional_database, suffix=new_suffix)
184
185 def find_frequent_itemsets(self, transactions, suffix=None, show_tree=False):
186 self.transactions = transactions
187
188 # Build the FP Growth Tree
189 self.tree_root = self._construct_tree(transactions)
190 if show_tree:
191 print ("FP-Growth Tree:")
192 self.print_tree(self.tree_root)
193
194 self._determine_frequent_itemsets(transactions, suffix=None)
195
196 return self.frequent_itemsets
197