BP231021 | SandeeP R
II MCA
ELECTIVE V : DATA MINING TECHNIQUES
UNIT 2 - ASSOCIATION RULES
Problem solving based Association Rule Algorithms
Use any transaction dataset, and apply
i. Frequent set algorithm
ii. Apriori algorithm
iii. Partition Algorithm
iv. Pincer Search
v. Dynamic Itemset Counting
Generate association rules.
Record your inference about one algorithm over the other.
According to you, which algorithm do you consider to be the most efficient for the dataset you have
chosen?
1. FREQUENT SET ALGORITHM
from itertools import combinations
from collections import defaultdict
def generate_candidates(itemsets, k):
    """Generate candidate itemsets of size k by joining pairs of itemsets.

    Args:
        itemsets: iterable of frozenset itemsets (typically the frequent
            (k-1)-itemsets from the previous level).
        k: target candidate size.

    Returns:
        Set of frozenset candidates whose pairwise union has exactly k items.
    """
    # Set comprehension instead of set([list-comp]): no throwaway list is
    # built and duplicates are collapsed directly (ruff C401).
    return {
        frozenset(a.union(b))
        for a in itemsets
        for b in itemsets
        if len(a.union(b)) == k
    }
def calculate_support(transactions, candidates):
    """Calculate the support count for candidate itemsets.

    Bug fix: the original increment line read `support_count[candidate] +=`
    with no right-hand operand, which is a SyntaxError.  Restored the
    intended `+= 1`.

    Args:
        transactions: iterable of transaction sets.
        candidates: iterable of frozenset candidate itemsets.

    Returns:
        defaultdict(int) mapping each candidate that occurs in at least one
        transaction to its occurrence count (zero-support candidates get no
        key, matching the defaultdict-based design used in this section).
    """
    support_count = defaultdict(int)
    for transaction in transactions:
        for candidate in candidates:
            if candidate.issubset(transaction):
                support_count[candidate] += 1
    return support_count
def frequent_set_algorithm(transactions, min_support):
    """Level-wise search for every itemset meeting the support threshold.

    Args:
        transactions: list of transaction sets.
        min_support: minimum absolute support count.

    Returns:
        Dict mapping frozenset itemsets to their support counts.
    """
    all_frequent = {}
    size = 1
    # Seed level 1 with every distinct item observed in the data.
    candidates = {frozenset([item]) for basket in transactions for item in basket}
    while candidates:
        counts = calculate_support(transactions, candidates)
        # Keep only the itemsets that clear the support threshold.
        survivors = {s: c for s, c in counts.items() if c >= min_support}
        all_frequent.update(survivors)
        # Join the survivors into candidates one item larger.
        size += 1
        candidates = generate_candidates(set(survivors), size)
    return all_frequent
# Example usage
if __name__ == "__main__":
    # Toy market-basket data: four transactions over four items.
    transactions = [
        {"milk", "bread", "butter"},
        {"beer", "bread", "butter"},
        {"milk", "beer", "bread"},
        {"milk", "bread", "butter", "beer"},
    ]
    min_support = 2  # itemset must occur in at least 2 transactions

    print("Frequent Itemsets:")
    for itemset, support in frequent_set_algorithm(transactions, min_support).items():
        print(f"{set(itemset)}: {support}")
OUTPUT :
Frequent Itemsets:
{'butter'}: 3
{'milk'}: 3
{'bread'}: 4
{'beer'}: 3
{'butter', 'bread'}: 3
{'milk', 'butter'}: 2
{'milk', 'bread'}: 3
{'beer', 'butter'}: 2
{'beer', 'bread'}: 3
{'milk', 'beer'}: 2
{'milk', 'butter', 'bread'}: 2
{'beer', 'butter', 'bread'}: 2
{'milk', 'beer', 'bread'}: 2
2. APRIORI ALGORITHM
from itertools import combinations
def generate_candidates(itemsets, k):
    """Join pairs of itemsets; keep unions that have exactly k items."""
    candidates = set()
    for first in itemsets:
        for second in itemsets:
            merged = first | second
            if len(merged) == k:
                candidates.add(frozenset(merged))
    return candidates
def calculate_support(transactions, candidates):
    """Count, for every candidate, how many transactions contain it.

    Every candidate gets a key, including those with zero support.
    """
    return {
        candidate: sum(1 for basket in transactions if candidate.issubset(basket))
        for candidate in candidates
    }
def apriori(transactions, min_support):
    """Apriori level-wise mining of frequent itemsets.

    Args:
        transactions: list of transaction sets.
        min_support: minimum absolute support count.

    Returns:
        Dict mapping frozenset itemsets to their support counts.
    """
    result = {}
    level = 1
    # Level 1: one candidate per distinct item in the dataset.
    candidates = {frozenset((item,)) for basket in transactions for item in basket}
    while candidates:
        counted = calculate_support(transactions, candidates)
        keep = {}
        for itemset, count in counted.items():
            if count >= min_support:
                keep[itemset] = count
        result.update(keep)
        # Next level: join surviving itemsets into (level+1)-candidates.
        level += 1
        candidates = generate_candidates(set(keep), level)
    return result
# Example usage
if __name__ == "__main__":
    # Four toy market-basket transactions.
    transactions = [
        {"milk", "bread", "butter"},
        {"beer", "bread", "butter"},
        {"milk", "beer", "bread"},
        {"milk", "bread", "butter", "beer"},
    ]
    min_support = 2  # minimum number of transactions an itemset must occur in

    print("Frequent Itemsets:")
    for itemset, support in apriori(transactions, min_support).items():
        print(f"{set(itemset)}: {support}")
OUTPUT :
Frequent Itemsets:
{'butter'}: 3
{'beer'}: 3
{'milk'}: 3
{'bread'}: 4
{'beer', 'butter'}: 2
{'butter', 'bread'}: 3
{'milk', 'beer'}: 2
{'beer', 'bread'}: 3
{'milk', 'butter'}: 2
{'milk', 'bread'}: 3
{'beer', 'butter', 'bread'}: 2
{'milk', 'beer', 'bread'}: 2
{'milk', 'butter', 'bread'}: 2
3. PARTITION ALGORITHM
from itertools import combinations
from collections import defaultdict
# Helper function to generate candidate itemsets
def generate_candidates(itemsets, k):
    """Join pairs of itemsets and keep the unions with exactly k items."""
    # Set comprehension instead of set([list-comp]): avoids the intermediate
    # list and deduplicates in one pass (ruff C401).
    return {
        frozenset(a.union(b))
        for a in itemsets
        for b in itemsets
        if len(a.union(b)) == k
    }
# Helper function to calculate support for itemsets in a partition
def calculate_support(transactions, candidates):
    """Count occurrences of each candidate; zero-support candidates get no key."""
    support_count = defaultdict(int)
    for candidate in candidates:
        for basket in transactions:
            if candidate <= basket:  # subset test
                support_count[candidate] += 1
    return support_count
# Partition Algorithm implementation
def _count_support(transactions, candidates):
    """Count how many transactions contain each candidate itemset."""
    counts = {candidate: 0 for candidate in candidates}
    for transaction in transactions:
        for candidate in candidates:
            if candidate.issubset(transaction):
                counts[candidate] += 1
    return counts


def _join_candidates(itemsets, k):
    """Join pairs of itemsets into candidates of exactly k items."""
    return {
        frozenset(a | b) for a in itemsets for b in itemsets if len(a | b) == k
    }


def _mine_partition(partition, local_min):
    """Level-wise mining of one partition; returns its locally frequent itemsets."""
    frequent = set()
    k = 1
    current = {frozenset([item]) for basket in partition for item in basket}
    while current:
        counts = _count_support(partition, current)
        level = {itemset for itemset, c in counts.items() if c >= local_min}
        if not level:
            break
        frequent |= level
        k += 1
        current = _join_candidates(level, k)
    return frequent


def partition_algorithm(transactions, n_partitions, min_support):
    """Partition algorithm (Savasere et al.) for frequent itemset mining.

    Phase I mines each partition with a locally *scaled* support threshold;
    the merge phase unions the local results; Phase II re-counts those
    candidates over the full dataset and filters by the global threshold.

    Bug fix: the original applied the absolute global min_support count to
    every partition.  Since partitions are smaller than the full dataset,
    that over-pruned the local results and silently dropped globally
    frequent itemsets (false negatives).  The local threshold is now
    min_support scaled by the partition's share of the transactions
    (rounded up), restoring the algorithm's completeness guarantee.

    Args:
        transactions: list of transaction sets.
        n_partitions: number of partitions to split the data into.
        min_support: minimum absolute support count over the whole dataset.

    Returns:
        Dict mapping frozenset itemsets to their global support counts.
    """
    total = len(transactions)
    partition_size = total // n_partitions
    partitions = [
        transactions[i * partition_size:(i + 1) * partition_size]
        for i in range(n_partitions)
    ]
    if total % n_partitions != 0:
        # Leftover transactions go to the last partition.
        partitions[-1] = partitions[-1] + transactions[n_partitions * partition_size:]

    # Phase I: locally frequent itemsets per partition (scaled threshold).
    global_candidates = set()
    for partition in partitions:
        # ceil(min_support * len(partition) / total) via integer arithmetic,
        # never below 1 so tiny partitions still contribute candidates.
        local_min = max(1, -(-min_support * len(partition) // total)) if total else 1
        global_candidates.update(_mine_partition(partition, local_min))

    # Phase II: validate every candidate against the entire dataset.
    final_counts = _count_support(transactions, global_candidates)
    return {
        itemset: count
        for itemset, count in final_counts.items()
        if count >= min_support
    }
# Example usage
if __name__ == "__main__":
    # Toy dataset: four transactions split into two partitions of two.
    transactions = [
        {"milk", "bread", "butter"},
        {"beer", "bread", "butter"},
        {"milk", "beer", "bread"},
        {"milk", "bread", "butter", "beer"},
    ]
    n_partitions = 2
    min_support = 2

    print("Frequent Itemsets:", partition_algorithm(transactions, n_partitions, min_support))
OUTPUT :
Frequent Itemsets: {frozenset({'butter', 'bread'}): 3, frozenset({'butter'}): 3,
frozenset({'milk'}): 3, frozenset({'bread'}): 4, frozenset({'milk', 'bread'}): 3,
frozenset({'beer', 'bread'}): 3, frozenset({'beer'}): 3, frozenset({'milk', 'beer', 'bread'}): 2,
frozenset({'milk', 'beer'}): 2}
4. PINCER SEARCH ALGORITHM
from itertools import combinations
from collections import defaultdict
def generate_candidates(itemsets, k):
    """Generate candidate itemsets of size k."""
    joined = set()
    for left in itemsets:
        for right in itemsets:
            union = left.union(right)
            if len(union) == k:
                joined.add(frozenset(union))
    return joined
def calculate_support(transactions, candidates):
    """Count how often each candidate occurs; zero-support candidates get no key."""
    support = defaultdict(int)
    for candidate in candidates:
        for basket in transactions:
            if candidate <= basket:  # subset test
                support[candidate] += 1
    return support
def pincer_search(transactions, min_support):
    """Bottom-up itemset mining with infrequent-subset pruning (Pincer-style).

    Bug fix: the original prune compared the *tuples* produced by
    itertools.combinations against a set of *frozensets*, so the membership
    test was always False and no candidate was ever pruned.  Each subset is
    now wrapped in frozenset() before the lookup, making the prune effective.
    (The final result is unchanged — pruned candidates could never become
    frequent — but the wasted support counting is eliminated.)

    Args:
        transactions: list of transaction sets.
        min_support: minimum absolute support count.

    Returns:
        Dict mapping frozenset itemsets to their support counts.
    """
    frequent_itemsets = {}      # all frequent itemsets found so far
    infrequent_itemsets = set() # itemsets proven infrequent (used for pruning)
    global_support = {}         # support bookkeeping for every candidate examined

    k = 1
    current_itemsets = set(
        frozenset([item]) for transaction in transactions for item in transaction
    )
    while current_itemsets:
        support_count = calculate_support(transactions, current_itemsets)
        global_support.update(support_count)

        # Split the level into frequent and infrequent itemsets.
        current_frequent = {
            itemset: count
            for itemset, count in support_count.items()
            if count >= min_support
        }
        frequent_itemsets.update(current_frequent)
        infrequent_itemsets.update(
            itemset for itemset, count in support_count.items() if count < min_support
        )

        # Terminate once a level produces no frequent itemsets.
        if not current_frequent:
            break

        k += 1
        current_itemsets = generate_candidates(set(current_frequent.keys()), k)
        # Prune candidates that contain a known-infrequent (k-1)-subset.
        # combinations() yields tuples, so each subset must be wrapped in
        # frozenset() for the membership test against infrequent_itemsets.
        current_itemsets = {
            candidate
            for candidate in current_itemsets
            if not any(
                frozenset(subset) in infrequent_itemsets
                for subset in combinations(candidate, k - 1)
            )
        }
    return frequent_itemsets
# Example usage
if __name__ == "__main__":
    # Same four-transaction toy dataset used throughout this exercise.
    transactions = [
        {"milk", "bread", "butter"},
        {"beer", "bread", "butter"},
        {"milk", "beer", "bread"},
        {"milk", "bread", "butter", "beer"},
    ]
    min_support = 2

    print("Frequent Itemsets:")
    for itemset, support in pincer_search(transactions, min_support).items():
        print(f"{set(itemset)}: {support}")
OUTPUT :
Frequent Itemsets:
{'butter'}: 3
{'milk'}: 3
{'bread'}: 4
{'beer'}: 3
{'butter', 'bread'}: 3
{'milk', 'butter'}: 2
{'milk', 'bread'}: 3
{'beer', 'butter'}: 2
{'beer', 'bread'}: 3
{'milk', 'beer'}: 2
{'milk', 'butter', 'bread'}: 2
{'beer', 'butter', 'bread'}: 2
{'milk', 'beer', 'bread'}: 2
5. DYNAMIC ITEMSET COUNTING
from itertools import combinations
def calculate_support(transactions, candidates):
    """Return a support count for every candidate (zero-support keys included)."""
    support_count = dict.fromkeys(candidates, 0)
    for basket in transactions:
        for candidate in candidates:
            if candidate <= basket:  # subset test
                support_count[candidate] += 1
    return support_count
def dic_algorithm(transactions, min_support):
    """Dynamic Itemset Counting (simplified level-wise variant).

    Improvement over the original: the `inactive_candidates` set was
    populated every iteration but never read anywhere, so it has been
    removed as dead state.  Behavior is otherwise unchanged.

    Args:
        transactions: list of transaction sets.
        min_support: minimum absolute support count.

    Returns:
        Dict mapping frozenset itemsets to their support counts.
    """
    frequent_itemsets = {}  # all frequent itemsets discovered so far
    k = 1                   # current itemset size
    active_candidates = set(
        frozenset([item]) for transaction in transactions for item in transaction
    )
    while active_candidates:
        support_count = calculate_support(transactions, active_candidates)
        # Keep the active candidates that clear the support threshold.
        current_frequent = {
            itemset: count
            for itemset, count in support_count.items()
            if count >= min_support
        }
        frequent_itemsets.update(current_frequent)

        # Dynamically derive (k+1)-candidates from proven-frequent itemsets,
        # keeping only those whose k-subsets are all frequent (Apriori prune).
        new_candidates = set()
        for itemset in current_frequent:
            for other in frequent_itemsets:
                union = itemset.union(other)
                if len(union) == k + 1 and all(
                    frozenset(subset) in frequent_itemsets
                    for subset in combinations(union, k)
                ):
                    new_candidates.add(union)

        active_candidates = new_candidates
        k += 1
    return frequent_itemsets
# Example usage
if __name__ == "__main__":
    # Four toy market-basket transactions over four items.
    transactions = [
        {"milk", "bread", "butter"},
        {"beer", "bread", "butter"},
        {"milk", "beer", "bread"},
        {"milk", "bread", "butter", "beer"},
    ]
    min_support = 2

    print("Frequent Itemsets:")
    for itemset, support in dic_algorithm(transactions, min_support).items():
        print(f"{set(itemset)}: {support}")
OUTPUT :
Frequent Itemsets:
{'butter'}: 3
{'beer'}: 3
{'milk'}: 3
{'bread'}: 4
{'butter', 'bread'}: 3
{'beer', 'butter'}: 2
{'beer', 'bread'}: 3
{'milk', 'butter'}: 2
{'milk', 'beer'}: 2
{'milk', 'bread'}: 3
{'beer', 'butter', 'bread'}: 2
{'milk', 'beer', 'bread'}: 2
{'milk', 'butter', 'bread'}: 2