ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
apriori.py
(7906B)
1 from __future__ import division, print_function
2 import numpy as np
3 import itertools
4
5
6 class Rule():
7 def __init__(self, antecedent, concequent, confidence, support):
8 self.antecedent = antecedent
9 self.concequent = concequent
10 self.confidence = confidence
11 self.support = support
12
13
14 class Apriori():
15 """A method for determining frequent itemsets in a transactional database and
16 also for generating rules for those itemsets.
17
18 Parameters:
19 -----------
20 min_sup: float
21 The minimum fraction of transactions an itemets needs to
22 occur in to be deemed frequent
23 min_conf: float:
24 The minimum fraction of times the antecedent needs to imply
25 the concequent to justify rule
26 """
27 def __init__(self, min_sup=0.3, min_conf=0.81):
28
29 self.min_sup = min_sup
30 self.min_conf = min_conf
31 self.freq_itemsets = None # List of freqeuent itemsets
32 self.transactions = None # List of transactions
33
34 def _calculate_support(self, itemset):
35 count = 0
36 for transaction in self.transactions:
37 if self._transaction_contains_items(transaction, itemset):
38 count += 1
39 support = count / len(self.transactions)
40 return support
41
42
43 def _get_frequent_itemsets(self, candidates):
44 """ Prunes the candidates that are not frequent => returns list with
45 only frequent itemsets """
46 frequent = []
47 # Find frequent items
48 for itemset in candidates:
49 support = self._calculate_support(itemset)
50 if support >= self.min_sup:
51 frequent.append(itemset)
52 return frequent
53
54
55 def _has_infrequent_itemsets(self, candidate):
56 """ True or false depending on the candidate has any
57 subset with size k - 1 that is not in the frequent itemset """
58 k = len(candidate)
59 # Find all combinations of size k-1 in candidate
60 # E.g [1,2,3] => [[1,2],[1,3],[2,3]]
61 subsets = list(itertools.combinations(candidate, k - 1))
62 for t in subsets:
63 # t - is tuple. If size == 1 get the element
64 subset = list(t) if len(t) > 1 else t[0]
65 if not subset in self.freq_itemsets[-1]:
66 return True
67 return False
68
69
70 def _generate_candidates(self, freq_itemset):
71 """ Joins the elements in the frequent itemset and prunes
72 resulting sets if they contain subsets that have been determined
73 to be infrequent. """
74 candidates = []
75 for itemset1 in freq_itemset:
76 for itemset2 in freq_itemset:
77 # Valid if every element but the last are the same
78 # and the last element in itemset1 is smaller than the last
79 # in itemset2
80 valid = False
81 single_item = isinstance(itemset1, int)
82 if single_item and itemset1 < itemset2:
83 valid = True
84 elif not single_item and np.array_equal(itemset1[:-1], itemset2[:-1]) and itemset1[-1] < itemset2[-1]:
85 valid = True
86
87 if valid:
88 # JOIN: Add the last element in itemset2 to itemset1 to
89 # create a new candidate
90 if single_item:
91 candidate = [itemset1, itemset2]
92 else:
93 candidate = itemset1 + [itemset2[-1]]
94 # PRUNE: Check if any subset of candidate have been determined
95 # to be infrequent
96 infrequent = self._has_infrequent_itemsets(candidate)
97 if not infrequent:
98 candidates.append(candidate)
99 return candidates
100
101
102 def _transaction_contains_items(self, transaction, items):
103 """ True or false depending on each item in the itemset is
104 in the transaction """
105 # If items is in fact only one item
106 if isinstance(items, int):
107 return items in transaction
108 # Iterate through list of items and make sure that
109 # all items are in the transaction
110 for item in items:
111 if not item in transaction:
112 return False
113 return True
114
115 def find_frequent_itemsets(self, transactions):
116 """ Returns the set of frequent itemsets in the list of transactions """
117 self.transactions = transactions
118 # Get all unique items in the transactions
119 unique_items = set(item for transaction in self.transactions for item in transaction)
120 # Get the frequent items
121 self.freq_itemsets = [self._get_frequent_itemsets(unique_items)]
122 while(True):
123 # Generate new candidates from last added frequent itemsets
124 candidates = self._generate_candidates(self.freq_itemsets[-1])
125 # Get the frequent itemsets among those candidates
126 frequent_itemsets = self._get_frequent_itemsets(candidates)
127
128 # If there are no frequent itemsets we're done
129 if not frequent_itemsets:
130 break
131
132 # Add them to the total list of frequent itemsets and start over
133 self.freq_itemsets.append(frequent_itemsets)
134
135 # Flatten the array and return every frequent itemset
136 frequent_itemsets = [
137 itemset for sublist in self.freq_itemsets for itemset in sublist]
138 return frequent_itemsets
139
140
141 def _rules_from_itemset(self, initial_itemset, itemset):
142 """ Recursive function which returns the rules where confidence >= min_confidence
143 Starts with large itemset and recursively explores rules for subsets """
144 rules = []
145 k = len(itemset)
146 # Get all combinations of sub-itemsets of size k - 1 from itemset
147 # E.g [1,2,3] => [[1,2],[1,3],[2,3]]
148 subsets = list(itertools.combinations(itemset, k - 1))
149 support = self._calculate_support(initial_itemset)
150 for antecedent in subsets:
151 # itertools.combinations returns tuples => convert to list
152 antecedent = list(antecedent)
153 antecedent_support = self._calculate_support(antecedent)
154 # Calculate the confidence as sup(A and B) / sup(B), if antecedent
155 # is B in an itemset of A and B
156 confidence = float("{0:.2f}".format(support / antecedent_support))
157 if confidence >= self.min_conf:
158 # The concequent is the initial_itemset except for antecedent
159 concequent = [itemset for itemset in initial_itemset if not itemset in antecedent]
160 # If single item => get item
161 if len(antecedent) == 1:
162 antecedent = antecedent[0]
163 if len(concequent) == 1:
164 concequent = concequent[0]
165 # Create new rule
166 rule = Rule(
167 antecedent=antecedent,
168 concequent=concequent,
169 confidence=confidence,
170 support=support)
171 rules.append(rule)
172
173 # If there are subsets that could result in rules
174 # recursively add rules from subsets
175 if k - 1 > 1:
176 rules += self._rules_from_itemset(initial_itemset, antecedent)
177 return rules
178
179 def generate_rules(self, transactions):
180 self.transactions = transactions
181 frequent_itemsets = self.find_frequent_itemsets(transactions)
182 # Only consider itemsets of size >= 2 items
183 frequent_itemsets = [itemset for itemset in frequent_itemsets if not isinstance(
184 itemset, int)]
185 rules = []
186 for itemset in frequent_itemsets:
187 rules += self._rules_from_itemset(itemset, itemset)
188 # Remove empty values
189 return rules
190