ml-finance-python

python scripts for finance machine learning

git clone https://9o.is/git/ml-finance-python.git

fp_growth.py

(7664B)


      1 from __future__ import division, print_function
      2 import numpy as np
      3 import itertools
      4 
      5 
      6 class FPTreeNode():
      7     def __init__(self, item=None, support=1):
      8         # 'Value' of the item
      9         self.item = item
     10         # Number of times the item occurs in a
     11         # transaction
     12         self.support = support
     13         # Child nodes in the FP Growth Tree
     14         self.children = {}
     15 
     16 
     17 class FPGrowth():
     18     """A method for determining frequent itemsets in a transactional database. 
     19     This is done by building a so called FP Growth tree, which can then be mined
     20     to collect the frequent itemsets. More effective than Apriori for large transactional
     21     databases.
     22 
     23     Parameters:
     24     -----------
     25     min_sup: float
     26         The minimum fraction of transactions an itemets needs to
     27         occur in to be deemed frequent
     28     """
     29     def __init__(self, min_sup=0.3):
     30         self.min_sup = min_sup
     31         # The root of the initial FP Growth Tree
     32         self.tree_root = None
     33         # Prefixes of itemsets in the FP Growth Tree
     34         self.prefixes = {}
     35         self.frequent_itemsets = []
     36 
     37     # Count the number of transactions that contains item.
     38     def _calculate_support(self, item, transactions):
     39         count = 0
     40         for transaction in transactions:
     41             if item in transaction:
     42                 count += 1
     43         support = count
     44         return support
     45 
     46 
     47     def _get_frequent_items(self, transactions):
     48         """ Returns a set of frequent items. An item is determined to
     49         be frequent if there are atleast min_sup transactions that contains
     50         it. """
     51         # Get all unique items in the transactions
     52         unique_items = set(
     53             item for transaction in transactions for item in transaction)
     54         items = []
     55         for item in unique_items:
     56             sup = self._calculate_support(item, transactions)
     57             if sup >= self.min_sup:
     58                 items.append([item, sup])
     59         # Sort by support - Highest to lowest
     60         items.sort(key=lambda item: item[1], reverse=True)
     61         frequent_items = [[el[0]] for el in items]
     62         # Only return the items
     63         return frequent_items
     64 
     65     def _insert_tree(self, node, children):
     66         """ Recursive method which adds nodes to the tree. """
     67         if not children:
     68             return
     69         # Create new node as the first item in children list
     70         child_item = children[0]
     71         child = FPTreeNode(item=child_item)
     72         # If parent already contains item => increase the support
     73         if child_item in node.children:
     74             node.children[child.item].support += 1
     75         else:
     76             node.children[child.item] = child
     77 
     78         # Execute _insert_tree on the rest of the children list
     79         # from the new node
     80         self._insert_tree(node.children[child.item], children[1:])
     81 
     82     def _construct_tree(self, transactions, frequent_items=None):
     83         if not frequent_items:
     84             # Get frequent items sorted by support
     85             frequent_items = self._get_frequent_items(transactions)
     86         unique_frequent_items = list(
     87             set(item for itemset in frequent_items for item in itemset))
     88         # Construct the root of the FP Growth tree
     89         root = FPTreeNode()
     90         for transaction in transactions:
     91             # Remove items that are not frequent according to
     92             # unique_frequent_items
     93             transaction = [item for item in transaction if item in unique_frequent_items]
     94             transaction.sort(key=lambda item: frequent_items.index([item]))
     95             self._insert_tree(root, transaction)
     96 
     97         return root
     98 
     99     def print_tree(self, node=None, indent_times=0):
    100         """ Recursive method which prints the FP Growth Tree """
    101         if not node:
    102             node = self.tree_root
    103         indent = "    " * indent_times
    104         print ("%s%s:%s" % (indent, node.item, node.support))
    105         for child_key in node.children:
    106             child = node.children[child_key]
    107             self.print_tree(child, indent_times + 1)
    108 
    109 
    110     def _is_prefix(self, itemset, node):
    111         """ Makes sure that the first item in itemset is a child of node 
    112         and that every following item in itemset is reachable via that path """
    113         for item in itemset:
    114             if not item in node.children:
    115                 return False
    116             node = node.children[item]
    117         return True
    118 
    119 
    120     def _determine_prefixes(self, itemset, node, prefixes=None):
    121         """ Recursive method that adds prefixes to the itemset by traversing the 
    122         FP Growth Tree"""
    123         if not prefixes:
    124             prefixes = []
    125 
    126         # If the current node is a prefix to the itemset
    127         # add the current prefixes value as prefix to the itemset
    128         if self._is_prefix(itemset, node):
    129             itemset_key = self._get_itemset_key(itemset)
    130             if not itemset_key in self.prefixes:
    131                 self.prefixes[itemset_key] = []
    132             self.prefixes[itemset_key] += [{"prefix": prefixes, "support": node.children[itemset[0]].support}]
    133 
    134         for child_key in node.children:
    135             child = node.children[child_key]
    136             # Recursive call with child as new node. Add the child item as potential
    137             # prefix.
    138             self._determine_prefixes(itemset, child, prefixes + [child.item])
    139 
    140 
    141     def _get_itemset_key(self, itemset):
    142         """ Determines the look of the hashmap key for self.prefixes
    143         List of more strings than one gets joined by '-' """
    144         if len(itemset) > 1:
    145             itemset_key = "-".join(itemset)
    146         else:
    147             itemset_key = str(itemset[0])
    148         return itemset_key
    149 
    150     def _determine_frequent_itemsets(self, conditional_database, suffix):
    151         # Calculate new frequent items from the conditional database
    152         # of suffix
    153         frequent_items = self._get_frequent_items(conditional_database)
    154 
    155         cond_tree = None
    156 
    157         if suffix:
    158             cond_tree = self._construct_tree(conditional_database, frequent_items)
    159             # Output new frequent itemset as the suffix added to the frequent
    160             # items
    161             self.frequent_itemsets += [el + suffix for el in frequent_items]
    162 
    163         # Find larger frequent itemset by finding prefixes
    164         # of the frequent items in the FP Growth Tree for the conditional
    165         # database.
    166         self.prefixes = {}
    167         for itemset in frequent_items:
    168             # If no suffix (first run)
    169             if not cond_tree:
    170                 cond_tree = self.tree_root
    171             # Determine prefixes to itemset
    172             self._determine_prefixes(itemset, cond_tree)
    173             conditional_database = []
    174             itemset_key = self._get_itemset_key(itemset)
    175             # Build new conditional database
    176             if itemset_key in self.prefixes:
    177                 for el in self.prefixes[itemset_key]:
    178                     # If support = 4 => add 4 of the corresponding prefix set
    179                     for _ in range(el["support"]):
    180                         conditional_database.append(el["prefix"])
    181                 # Create new suffix
    182                 new_suffix = itemset + suffix if suffix else itemset
    183                 self._determine_frequent_itemsets(conditional_database, suffix=new_suffix)
    184 
    185     def find_frequent_itemsets(self, transactions, suffix=None, show_tree=False):
    186         self.transactions = transactions
    187 
    188         # Build the FP Growth Tree
    189         self.tree_root = self._construct_tree(transactions)
    190         if show_tree:
    191             print ("FP-Growth Tree:")
    192             self.print_tree(self.tree_root)
    193 
    194         self._determine_frequent_itemsets(transactions, suffix=None)
    195 
    196         return self.frequent_itemsets
    197