# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
# Download the NLTK resources needed by the preprocessing step:
# tokenizer models, the English stop-word list and the WordNet lemma data.
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
# Load the dataset from CSV; adjust the path to wherever news.csv is stored.
data = pd.read_csv('/news.csv')
# Explore the dataset
print(data.head())  # Check the first few rows
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()
# Data preprocessing: build the lemmatizer and the stop-word set once at
# module level so preprocess_text can reuse them for every row.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Return a cleaned, lemmatized version of *text*.

    The text is lowercased, every non-word character and digit is replaced
    by a space, the result is tokenized, stop words are dropped and the
    remaining tokens are lemmatized and joined with single spaces.

    Relies on the module-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text: The raw document. Non-string values (for example the NaN that
            pandas uses for a missing cell, or None) are treated as empty.

    Returns:
        The processed text as a single space-separated string; ``''`` when
        the input is not a string.
    """
    # Missing CSV cells arrive as float NaN, which has no .lower(); treat any
    # non-string input as an empty document instead of raising.
    if not isinstance(text, str):
        return ''
    # One pass replaces both non-word characters (\W) and digits (\d) with a
    # space. Underscores count as word characters and are therefore kept.
    text = re.sub(r'[\W\d]', ' ', text.lower())
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words and lemmatize the remaining tokens
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join words back into text
    return ' '.join(words)
# Clean the 'text' column in place before vectorization.
data['text'] = data['text'].apply(preprocess_text)
# Feature extraction: the cleaned text is the input, 'label' is the target.
X = data['text']
y = data['label']
# Split the dataset into training (80%) and testing (20%) sets; the fixed
# seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# TF-IDF Vectorization: the vocabulary is fitted on the training split only
# and then reused to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
# Model building - using Passive Aggressive Classifier.
# The classifier shuffles the training data between epochs; seeding it (to
# match the seeded split above) makes the reported metrics reproducible.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(tfidf_train, y_train)
# Prediction
y_pred = model.predict(tfidf_test)
# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix}")