AI & ML Lab
EXPERIMENT-1
1.Pandas Library
a) Write a python program to implement Pandas Series with labels.
Source Code:
import pandas as pd
# Create a Pandas Series with labels
data = [10, 20, 30, 40]
labels = ['A', 'B', 'C', 'D']
series = pd.Series(data, index=labels)
print("Pandas Series with Labels:")
print(series)
OUTPUT:
Pandas Series with Labels:
A 10
B 20
C 30
D 40
dtype: int64
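Because the index labels are preserved, values can also be accessed by label rather than by position. A short follow-up sketch reusing the series created above:
# Label-based access (reuses 'series' from above)
print(series['B'])         # single value by label -> 20
print(series[['A', 'D']])  # sub-Series selected by a list of labels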
b) Create a Pandas Series from a Dictionary
CODE:
import pandas as pd
# Create a Pandas Series from a dictionary
data_dict = {'A': 10, 'B': 20, 'C': 30, 'D': 40}
series_from_dict = pd.Series(data_dict)
print("Pandas Series from Dictionary:")
print(series_from_dict)
OUTPUT:
Pandas Series from Dictionary:
A 10
B 20
C 30
D 40
dtype: int64
c) Creating a Pandas DataFrame
import pandas as pd
# Create a Pandas DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [24, 27, 22],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)
print("Pandas DataFrame:")
print(df)
OUTPUT:
Pandas DataFrame:
Name Age City
0 Alice 24 New York
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
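Columns and rows of the DataFrame can then be selected by label or filtered by condition. A brief illustrative sketch reusing the df created above:
# Selecting data from the DataFrame above
print(df['Name'])          # a single column
print(df.loc[0])           # the row with index label 0
print(df[df['Age'] > 23])  # rows where Age is greater than 23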
d) Pandas Methods: describe(), head(), tail(), info()
import pandas as pd
# Creating a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [24, 27, 22, 23, 25],
    'Salary': [50000, 55000, 60000, 45000, 70000]
}
df = pd.DataFrame(data)
# i) describe() - Summary statistics of the DataFrame
print("\nSummary Statistics (describe):")
print(df.describe())
# ii) head() - First 3 rows
print("\nFirst 3 Rows (head):")
print(df.head(3))
# iii) tail() - Last 3 rows
print("\nLast 3 Rows (tail):")
print(df.tail(3))
# iv) info() - DataFrame structure info
print("\nDataFrame Info (info):")
df.info()  # info() prints the summary itself and returns None, so no print() wrapper is needed
OUTPUT:
Summary Statistics (describe):
Age Salary
count 5.000000 5.000000
mean 24.200000 56000.000000
std 1.832882 9241.473013
min 22.000000 45000.000000
25% 23.000000 50000.000000
50% 24.000000 55000.000000
75% 25.000000 60000.000000
max 27.000000 70000.000000
First 3 Rows (head):
Name Age Salary
0 Alice 24 50000
1 Bob 27 55000
2 Charlie 22 60000
Last 3 Rows (tail):
Name Age Salary
2 Charlie 22 60000
3 David 23 45000
4 Eva 25 70000
DataFrame Info (info):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 5 non-null object
1 Age 5 non-null int64
2 Salary 5 non-null int64
dtypes: int64(2), object(1)
memory usage: 203.0+ bytes
EXPERIMENT-2
2.Pandas Library: Visualization
a) Write a program that uses pandas' built-in visualization to plot the
following graphs:
i. Bar plots ii. Histograms iii. Line plots iv. Scatter plots
Pandas provides a simple interface for creating various types of plots directly
using the .plot() method, which internally utilizes the matplotlib library for
visualization. Here's a program demonstrating how to generate different types of
plots (bar plots, histograms, line plots, scatter plots).
import pandas as pd
import matplotlib.pyplot as plt
# Sample DataFrame
data = {
    'Category': ['A', 'B', 'C', 'D', 'E'],
    'Values': [23, 45, 56, 78, 33],
    'Age': [24, 27, 22, 35, 29],
    'Salary': [50000, 55000, 60000, 45000, 70000]
}
# Creating a DataFrame
df = pd.DataFrame(data)
# i. Bar Plot (figsize is passed to .plot(), which creates its own figure)
df.plot(kind='bar', x='Category', y='Values', color='skyblue', legend=False, figsize=(10, 6))
plt.title('Bar Plot: Category vs Values')
plt.xlabel('Category')
plt.ylabel('Values')
plt.show()
# ii. Histogram
df['Salary'].plot(kind='hist', bins=5, color='green', edgecolor='black', figsize=(10, 6))
plt.title('Histogram: Salary Distribution')
plt.xlabel('Salary')
plt.ylabel('Frequency')
plt.show()
# iii. Line Plot
df.plot(kind='line', x='Category', y='Salary', color='purple', marker='o', figsize=(10, 6))
plt.title('Line Plot: Category vs Salary')
plt.xlabel('Category')
plt.ylabel('Salary')
plt.show()
# iv. Scatter Plot
df.plot(kind='scatter', x='Age', y='Salary', color='red', figsize=(10, 6))
plt.title('Scatter Plot: Age vs Salary')
plt.xlabel('Age')
plt.ylabel('Salary')
plt.show()
Output (For Each Plot):
Bar Plot: a bar chart showing Category on the x-axis and Values on the y-axis.
Histogram: a histogram showing the distribution of salary values across the bins.
Line Plot: a line plot showing the trend of salaries across categories.
Scatter Plot: a scatter plot showing the relationship between age and salary.
EXPERIMENT-3
3.Write a Program to Implement Breadth First Search using Python.
Breadth-First Search (BFS) Algorithm Implementation in Python
Breadth-First Search (BFS) is a graph traversal algorithm used to explore all the
nodes of a graph level by level. It starts from a source node and explores all its
neighboring nodes before moving on to the next level of neighbors. BFS uses a
queue data structure to keep track of nodes to visit next.
Here's how we can implement BFS in Python using an adjacency list
representation of the graph.
from collections import deque

# Graph represented as an adjacency list
class Graph:
    def __init__(self):
        self.graph = {}

    # Add an edge to the graph
    def add_edge(self, u, v):
        if u not in self.graph:
            self.graph[u] = []
        if v not in self.graph:
            self.graph[v] = []
        self.graph[u].append(v)
        self.graph[v].append(u)  # Add the reverse edge so the graph is undirected

    # BFS implementation
    def bfs(self, start):
        # A set to keep track of visited nodes
        visited = set()
        # Create a queue for BFS
        queue = deque([start])
        # Mark the starting node as visited
        visited.add(start)
        while queue:
            # Dequeue a vertex from the queue
            node = queue.popleft()
            # Print the current node
            print(node, end=" ")
            # Traverse all the neighbors of the node
            for neighbor in self.graph[node]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)

# Driver Code
if __name__ == "__main__":
    # Create a graph
    g = Graph()
    # Add edges to the graph (undirected)
    g.add_edge(0, 1)
    g.add_edge(0, 2)
    g.add_edge(1, 3)
    g.add_edge(1, 4)
    g.add_edge(2, 5)
    g.add_edge(2, 6)
    print("Breadth-First Search (BFS) starting from node 0:")
    g.bfs(0)
Output:
Breadth-First Search (BFS) starting from node 0:
0 1 2 3 4 5 6
EXPERIMENT-4
4. Write a program to implement Best First Searching Algorithm
from collections import deque
def best_first_search(start, goal, graph, heuristic):
    """
    Implements the Best First Search algorithm.
    Args:
        start: The starting node.
        goal: The goal node.
        graph: A dictionary representing the graph,
            where keys are nodes and values are lists of
            (neighbor, cost) tuples.
        heuristic: A function that estimates the cost from a node to the goal.
    Returns:
        A tuple containing:
        - The path from start to goal as a list of nodes.
        - The total cost of the path.
        - None if no path is found.
    """
    frontier = deque([(start, 0)])  # Frontier of (node, cost so far) pairs
    explored = set()
    parent = {}  # To reconstruct the path
    while frontier:
        current, current_cost = frontier.popleft()
        explored.add(current)
        if current == goal:
            path = reconstruct_path(parent, start, goal)
            # current_cost is the cost accumulated when the goal is dequeued,
            # which may exceed the cost of the reconstructed path
            return path, current_cost
        for neighbor, cost in graph.get(current, []):
            if neighbor not in explored:
                total_cost = current_cost + cost
                priority = total_cost + heuristic(neighbor, goal)  # Priority based on cost + heuristic
                frontier.append((neighbor, total_cost))
                parent[neighbor] = current
    return None, None  # No path found

def reconstruct_path(parent, start, goal):
    """
    Reconstructs the path from start to goal using the parent dictionary.
    """
    path = [goal]
    while goal != start:
        goal = parent[goal]
        path.insert(0, goal)
    return path
# Example usage:
graph = {
'A': [('B', 2), ('C', 1)],
'B': [('D', 3)],
'C': [('D', 1), ('E', 2)],
'D': [('F', 3)],
'E': [('F', 1)],
'F': []
}
# Define a simple illustrative heuristic for this example
def heuristic(node, goal):
    # 0 for nodes adjacent to the goal, 1 otherwise
    if node in ['E', 'D']:
        return 0
    else:
        return 1
start_node = 'A'
goal_node = 'F'
path, cost = best_first_search(start_node, goal_node, graph, heuristic)
if path:
print("Path found:", path)
print("Total cost:", cost)
else:
print("No path found.")
OUTPUT:
Path found: ['A', 'C', 'E', 'F']
Total cost: 8
EXPERIMENT-5
5.Write a Program to Implement Depth First Search using Python.
Code:
def dfs(graph, start):
    """
    Performs Depth First Search (DFS) on a graph.
    Args:
        graph: A dictionary representing the graph,
            where keys are nodes and values are lists of neighbors.
        start: The starting node for the DFS traversal.
    Returns:
        A list of nodes visited in the DFS order.
    """
    visited = set()
    result = []

    def dfs_recursive(node):
        if node not in visited:
            visited.add(node)
            result.append(node)
            for neighbor in graph.get(node, []):
                dfs_recursive(neighbor)

    dfs_recursive(start)
    return result
# Example usage:
graph = {
    'A': ['B', 'C'],
    'B': ['D', 'E'],
    'C': ['F'],
    'D': [],
    'E': ['F'],
    'F': []
}
start_node = 'A'
dfs_order = dfs(graph, start_node)
print("DFS traversal:", dfs_order)
OUTPUT:
DFS traversal: ['A', 'B', 'D', 'E', 'F', 'C']
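The recursive implementation above can hit Python's default recursion limit on very deep graphs. An equivalent iterative variant using an explicit stack can be sketched as follows; it reuses the graph and start_node defined above and visits nodes in the same order:
def dfs_iterative(graph, start):
    visited = set()
    result = []
    stack = [start]
    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            result.append(node)
            # Push neighbors in reverse so they are expanded in the listed order
            for neighbor in reversed(graph.get(node, [])):
                if neighbor not in visited:
                    stack.append(neighbor)
    return result

print("Iterative DFS traversal:", dfs_iterative(graph, start_node))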
EXPERIMENT-6
6.Write a program to implement the Heuristic Search
CODE:
from queue import PriorityQueue
def heuristic_search(start, goal, graph, heuristic):
    """
    Performs a general heuristic search on a graph.
    Args:
        start: The starting node.
        goal: The goal node.
        graph: A dictionary representing the graph,
            where keys are nodes and values are lists of
            (neighbor, cost) tuples.
        heuristic: A function that estimates the cost from a node to the goal.
    Returns:
        A tuple containing:
        - The path from start to goal as a list of nodes.
        - The total cost of the path.
        - None if no path is found.
    """
    frontier = PriorityQueue()
    frontier.put((0, start))  # Priority queue: (priority, node)
    came_from = {}
    cost_so_far = {}
    came_from[start] = None
    cost_so_far[start] = 0
    while not frontier.empty():
        _, current = frontier.get()
        if current == goal:
            return reconstruct_path(came_from, start, goal), cost_so_far[goal]
        for next, cost in graph.get(current, []):
            new_cost = cost_so_far[current] + cost
            if next not in cost_so_far or new_cost < cost_so_far[next]:
                cost_so_far[next] = new_cost
                priority = new_cost + heuristic(next, goal)
                frontier.put((priority, next))
                came_from[next] = current
    return None, None  # No path found

def reconstruct_path(came_from, start, goal):
    """
    Reconstructs the path from start to goal using the came_from dictionary.
    """
    path = [goal]
    while goal != start:
        goal = came_from[goal]
        path.insert(0, goal)
    return path
# Example usage:
graph = {
    'A': [('B', 2), ('C', 1)],
    'B': [('D', 3)],
    'C': [('D', 1), ('E', 2)],
    'D': [('F', 3)],
    'E': [('F', 1)],
    'F': []
}
# Define a simple illustrative heuristic for this example
def heuristic(node, goal):
    # 0 for nodes adjacent to the goal, 1 otherwise
    if node in ['E', 'D']:
        return 0
    else:
        return 1
start_node = 'A'
goal_node = 'F'
path, cost = heuristic_search(start_node, goal_node, graph, heuristic)
if path:
print("Path found:", " -> ".join(path))
print("Total cost:", cost)
else:
print("No path found.")
OUTPUT:
Path found: A -> C -> E -> F
Total cost: 4
EXPERIMENT-7
7.AIM: Write a python program to implement A* and AO* algorithm. (Ex: find
the shortest path)
from queue import PriorityQueue
def a_star_search(start, goal, graph, heuristic):
    """
    Implements the A* search algorithm.
    Args:
        start: The starting node.
        goal: The goal node.
        graph: A dictionary representing the graph,
            where keys are nodes and values are lists of
            (neighbor, cost) tuples.
        heuristic: A function that estimates the cost from a node to the goal.
    Returns:
        A tuple containing:
        - The path from start to goal as a list of nodes.
        - The total cost of the path.
        - None if no path is found.
    """
    frontier = PriorityQueue()
    frontier.put((0, start))  # Priority queue: (f_score, node)
    came_from = {}
    cost_so_far = {}
    came_from[start] = None
    cost_so_far[start] = 0
    while not frontier.empty():
        _, current = frontier.get()
        if current == goal:
            return reconstruct_path(came_from, start, goal), cost_so_far[goal]
        for next, cost in graph.get(current, []):
            new_cost = cost_so_far[current] + cost
            if next not in cost_so_far or new_cost < cost_so_far[next]:
                cost_so_far[next] = new_cost
                priority = new_cost + heuristic(next, goal)
                frontier.put((priority, next))
                came_from[next] = current
    return None, None  # No path found

def ao_star_search(start, goal, graph, heuristic):
    """
    Implements the AO* search algorithm (simplified version).
    Note: This implementation is a simplified version and may not handle all
    edge cases or dynamic environments effectively.
    Args:
        start: The starting node.
        goal: The goal node.
        graph: A dictionary representing the graph,
            where keys are nodes and values are lists of
            (neighbor, cost) tuples.
        heuristic: A function that estimates the cost from a node to the goal.
    Returns:
        A tuple containing:
        - The path from start to goal as a list of nodes.
        - The total cost of the path.
        - None if no path is found.
    """
    # Simplified version, assumes the initial graph is correct
    return a_star_search(start, goal, graph, heuristic)

def reconstruct_path(came_from, start, goal):
    """
    Reconstructs the path from start to goal using the came_from dictionary.
    """
    path = [goal]
    while goal != start:
        goal = came_from[goal]
        path.insert(0, goal)
    return path
# Example usage:
graph = {
'A': [('B', 2), ('C', 1)],
'B': [('D', 3)],
'C': [('D', 1), ('E', 2)],
'D': [('F', 3)],
'E': [('F', 1)],
'F': []
}
# Define a simple illustrative heuristic for this example
def heuristic(node, goal):
    # 0 for nodes adjacent to the goal, 1 otherwise
    if node in ['E', 'D']:
        return 0
    else:
        return 1
start_node = 'A'
goal_node = 'F'
print("A* Search:")
a_star_path, a_star_cost = a_star_search(start_node, goal_node, graph,
heuristic)
if a_star_path:
print("Path found:", " -> ".join(a_star_path))
print("Total cost:", a_star_cost)
else:
print("No path found.")
print("\nAO* Search (Simplified):")
ao_star_path, ao_star_cost = ao_star_search(start_node, goal_node, graph,
heuristic)
if ao_star_path:
print("Path found:", " -> ".join(ao_star_path))
print("Total cost:", ao_star_cost)
else:
print("No path found.")
OUTPUT:
A* Search:
Path found: A -> C -> E -> F
Total cost: 4
AO* Search (Simplified):
Path found: A -> C -> E -> F
Total cost: 4
EXPERIMENT-8
8.Apply the following Pre-processing techniques for a given dataset.
a. Attribute selection b. Handling Missing Values c. Discretization
d. Elimination of Outliers
1. Attribute Selection
Purpose: Select the most relevant features from the dataset to improve
model performance and reduce dimensionality.
Techniques:
o Filter Methods: Evaluate the relevance of features independently
of the learning algorithm (e.g., correlation analysis, chi-square test,
information gain).
o Wrapper Methods: Evaluate the relevance of features based on
the performance of the learning algorithm (e.g., forward selection,
backward elimination, recursive feature elimination).
o Embedded Methods: Integrate feature selection into the learning
algorithm itself (e.g., L1 regularization in Lasso regression).
2. Handling Missing Values
Purpose: Replace missing values with appropriate estimates to avoid
data loss and improve model accuracy.
Techniques:
o Deletion: Remove rows or columns with missing values (can lead
to significant data loss).
o Imputation:
Mean/Median/Mode Imputation: Replace missing values
with the mean, median, or mode of the respective feature.
K-Nearest Neighbors (KNN) Imputation: Impute missing
values based on the values of k-nearest neighbors.
Multiple Imputation: Create multiple plausible imputations
for each missing value and average the results.
3. Discretization
Purpose: Convert continuous variables into discrete intervals. This can
improve model performance, handle noisy data, and improve data
visualization.
Techniques:
o Equal-width binning: Divide the data into intervals of equal
width.
o Equal-depth binning: Divide the data into intervals containing an
equal number of data points.
o Entropy-based discretization: Determine cut-points based on
information gain or entropy.
4. Elimination of Outliers
Purpose: Remove data points that deviate significantly from the rest of
the data, which can negatively impact model performance and bias
results.
Techniques:
o Z-score method: Calculate the Z-score for each data point and
remove those exceeding a certain threshold (e.g., Z-score > 3).
o Interquartile Range (IQR) method: Identify outliers based on the
IQR, which is the difference between the 75th and 25th percentiles.
o Visualization: Use box plots or scatter plots to visually identify
outliers.
Example using Python (with scikit-learn):
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import IsolationForest
# Load your dataset (replace 'your_data.csv' with your actual file)
data = pd.read_csv('your_data.csv')
# 1. Attribute Selection (Example: Using correlation)
# Calculate correlation matrix
corr_matrix = data.corr()
# Select features with high correlation with the target variable
# (replace 'target_column' with your target column)
selected_features = corr_matrix['target_column'].abs().sort_values(ascending=False)[1:]
# Keep features with absolute correlation above a threshold (e.g., 0.5)
selected_features = selected_features[selected_features > 0.5].index.tolist()
data = data[selected_features]
# 2. Handling Missing Values (Example: Mean imputation)
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data)
# 3. Discretization (Example: Equal-width binning)
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
data_discretized = discretizer.fit_transform(data_imputed)
# 4. Outlier Detection (Example: Isolation Forest)
outlier_detector = IsolationForest(contamination=0.05)
outlier_detector.fit(data_discretized)
outliers = outlier_detector.predict(data_discretized) == -1
# Remove outliers
data_clean = data_discretized[~outliers]
# Convert back to pandas DataFrame (optional)
data_clean = pd.DataFrame(data_clean, columns=data.columns)
print("Cleaned and preprocessed data:")
print(data_clean)
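The Z-score and IQR techniques listed under outlier elimination can also be applied directly with pandas. A small illustrative sketch of the IQR method on a hypothetical 'Salary' column (the values here are made up for demonstration):
import pandas as pd
# Hypothetical salary values; the last one is an obvious outlier
df = pd.DataFrame({'Salary': [50000, 55000, 60000, 45000, 70000, 250000]})
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# Keep only rows inside the IQR fences
df_no_outliers = df[(df['Salary'] >= lower) & (df['Salary'] <= upper)]
print(df_no_outliers)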
EXPERIMENT-9
9.AIM:
Apply KNN algorithm for classification and regression
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
# Classification Example: Iris Dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print("Classification Accuracy:", accuracy)
# Regression Example: KNN regression on synthetic linear data
np.random.seed(0)
X = np.random.rand(100, 1)
y = 3 + 2 * X + np.random.randn(100, 1)
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Train KNN regressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Regression MSE:", mse)
# Plot regression line
plt.scatter(X_test, y_test, label="Actual")
plt.scatter(X_test, y_pred, label="Predicted")
plt.legend()
plt.show()
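The value of n_neighbors strongly affects KNN performance, so it is worth tuning. A possible sketch using GridSearchCV, assuming X_train and y_train still hold the standardized Iris split from the classification part above:
from sklearn.model_selection import GridSearchCV
# Assumes the scaled Iris training split from the classification example
param_grid = {'n_neighbors': list(range(1, 16))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print("Best k:", grid.best_params_['n_neighbors'])
print("Best cross-validated accuracy:", grid.best_score_)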
EXPERIMENT-10
10.AIM:Demonstrate decision tree algorithm for a classification problem and
perform parameter tuning for better results
CODE:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd
# Load your dataset (replace 'your_data.csv' with your actual file)
data = pd.read_csv('your_data.csv')
# Separate features (X) and target variable (y)
# Replace 'target_column' with the name of your target column
X = data.drop('target_column', axis=1)
y = data['target_column']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Create a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
# Define hyperparameter grid for tuning
param_grid = {
'criterion': ['gini', 'entropy'],
'max_depth': [None, 5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid,
cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)
# Train the model with the best hyperparameters
best_clf = DecisionTreeClassifier(**best_params, random_state=42)
best_clf.fit(X_train, y_train)
# Make predictions
y_pred = best_clf.predict(X_test)
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
EXPERIMENT-11
11.Apply Random Forest algorithm for classification and regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
# Sample data (replace with your actual data)
# For classification
X_clf = [[0, 0], [1, 1], [0, 1], [1, 0]]
y_clf = [0, 1, 1, 0]
# For regression
X_reg = [[0, 0], [1, 1], [2, 2], [3, 3]]
y_reg = [0, 1, 2, 3]
# Split data into training and testing sets
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf,
test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg,
y_reg, test_size=0.2, random_state=42)
# Create and train the models
# Classification
clf = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
clf.fit(X_train_clf, y_train_clf)
# Regression
reg = RandomForestRegressor(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
reg.fit(X_train_reg, y_train_reg)
# Make predictions
y_pred_clf = clf.predict(X_test_clf)
y_pred_reg = reg.predict(X_test_reg)
# Evaluate performance
# Classification
accuracy = accuracy_score(y_test_clf, y_pred_clf)
print("Classification Accuracy:", accuracy)
# Regression
mse = mean_squared_error(y_test_reg, y_pred_reg)
print("Regression Mean Squared Error:", mse)
EXPERIMENT-12
12.Demonstrate Naïve Bayes Classification algorithm.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Sample data (replace with your actual data)
X = [[1, 2], [2, 3], [3, 1], [4, 3], [5, 2]]
y = [0, 0, 1, 1, 0]
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Create and train the Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Make predictions
y_pred = gnb.predict(X_test)
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
EXPERIMENT-13
13.Apply Support Vector algorithm for classification.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Sample data (replace with your actual data)
X = [[1, 2], [2, 3], [3, 1], [4, 3], [5, 2]]
y = [0, 0, 1, 1, 0]
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Create and train the SVM classifier
svm_classifier = SVC(kernel='linear') # Use 'linear' kernel for linear SVM
svm_classifier.fit(X_train, y_train)
# Make predictions
y_pred = svm_classifier.predict(X_test)
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
EXPERIMENT-14
14.Implement the K-means algorithm and apply it to the data you selected.
Evaluate performance by measuring the sum of the Euclidean distance of each
example from its class center. Test the performance of the algorithm as a
function of the parameter K.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances
def kmeans(data, k, max_iter=100):
    """
    Implements the K-means clustering algorithm.
    Args:
        data: A NumPy array containing the data points.
        k: The number of clusters.
        max_iter: The maximum number of iterations.
    Returns:
        A tuple containing:
        - The cluster assignments for each data point.
        - The cluster centroids.
    """
    # Initialize centroids randomly
    centroids = data[np.random.choice(data.shape[0], k, replace=False)]
    for _ in range(max_iter):
        # Assign each data point to the nearest centroid
        distances = pairwise_distances(data, centroids)
        labels = np.argmin(distances, axis=1)
        # Update centroids
        new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)])
        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return labels, centroids
def evaluate_kmeans(data, labels, centroids):
    """
    Evaluates the K-means clustering by calculating the sum of Euclidean
    distances of each example from its class center.
    Args:
        data: A NumPy array containing the data points.
        labels: The cluster assignments for each data point.
        centroids: The cluster centroids.
    Returns:
        The sum of Euclidean distances.
    """
    # Euclidean distance of each point to the centroid of its own cluster
    distances = np.linalg.norm(data - centroids[labels], axis=1)
    return np.sum(distances)
# Load the Iris dataset
iris = load_iris()
data = iris.data
# Test the performance of K-means for different values of K
k_values = range(2, 11)
results = []
for k in k_values:
    labels, centroids = kmeans(data, k)
    distance_sum = evaluate_kmeans(data, labels, centroids)
    results.append(distance_sum)
# Plot the results
import matplotlib.pyplot as plt
plt.plot(k_values, results, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Euclidean Distances')
plt.title('K-means Performance on Iris Dataset')
plt.show()