import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, coint, grangercausalitytests
from scipy.stats import shapiro
# Load the Excel file
file_path = "/content/Stationarized_IFM_data_v2.xlsx" # Update this if needed
df = pd.read_excel(file_path)
# Display column names
print("Columns in the dataset:", df.columns)
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
# Load the dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)
# Function to check stationarity
def check_stationarity(series, significance=0.05):
    # Treat a series that is empty after dropna() as stationary to avoid
    # passing an empty array to adfuller
    if series.dropna().empty:
        return True
    result = adfuller(series.dropna())
    return result[1] < significance  # True if the ADF p-value rejects a unit root
# Apply differencing iteratively until all attributes become stationary
stationary_df = df.copy()
# Restrict to the data columns: exclude the identifier column and any text
# helper columns, which would make the ADF test fail
columns_to_check = [c for c in stationary_df.select_dtypes(include=np.number).columns
                    if c != 'Index']
for col in columns_to_check:
    diff_count = 0
    while not check_stationarity(stationary_df[col]):
        # Index alignment keeps the leading NaN; check_stationarity drops it
        stationary_df[col] = stationary_df[col].diff()
        diff_count += 1
        if diff_count > 5:  # Prevent infinite loops
            print(f"{col} is not stationary even after 5 differences.")
            break
    print(f"{col} became stationary after {diff_count} differencing(s).")
# Save the stationary dataset
stationary_file_path = "/content/Stationarized_IFM_data_v2.xlsx"
stationary_df.to_excel(stationary_file_path, index=False)
print(f"Stationary dataset saved at {stationary_file_path}")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
# Load data
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)
# List of columns to check for stationarity
# (identifier and helper columns such as 'Index', 'Unnamed: 11' and
# 'how to retrive data' are excluded: they are not time series)
columns = ['ILSSPOT', 'ILSFORWARDS', 'INRSPOT', 'INRFORWARDS', 'EGPSPOT',
           'EGPFORWARDS', 'IQDSPOT', 'IQDFORWARDS', 'JODSPOT', 'JODFORWARDS']
# Dictionary to store differencing counts
diff_counts = {col: 0 for col in columns}
# Function to check stationarity using ADF test
def is_stationary(series, significance=0.05):
    # Treat a series that is empty after dropna() as stationary to avoid
    # passing an empty array to adfuller
    if series.dropna().empty:
        return True
    result = adfuller(series.dropna(), autolag='AIC')
    return result[1] < significance
# Create a copy of the original dataframe for plotting
df_original = df.copy()
# Apply differencing iteratively until stationarity is achieved
for col in columns:
    temp_series = df[col].copy()
    # Keep differencing until the series tests stationary (or runs empty)
    while not is_stationary(temp_series) and not temp_series.dropna().empty:
        temp_series = temp_series.diff().dropna()
        diff_counts[col] += 1
    df[col] = temp_series  # Store the transformed series
# Save the stationary dataset
output_path = "/content/Stationarized_IFM_data_v2.xlsx"
df.to_excel(output_path, index=False)
# Plot original vs stationary data
fig, axes = plt.subplots(nrows=len(columns), ncols=2, figsize=(12, 18))
fig.suptitle("Original vs Stationary Time Series", fontsize=14)
for i, col in enumerate(columns):
    # Align the original and stationary series properly
    original_series = df_original[col].dropna()
    stationary_series = df[col].dropna()
    axes[i, 0].plot(original_series.index, original_series, label="Original")
    axes[i, 0].set_title(f"Original {col}")
    axes[i, 1].plot(stationary_series.index, stationary_series,
                    label="Stationary", color="red")
    axes[i, 1].set_title(f"Stationary {col} after {diff_counts[col]} differencing(s)")
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()
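# --- Optional check (a minimal sketch): autocorrelation plots give a visual
# confirmation that the differenced series no longer trend. plot_acf is the
# standard statsmodels ACF plot; the loop reuses `columns` from this cell.
from statsmodels.graphics.tsaplots import plot_acf

for col in columns:
    series = df[col].dropna()
    if len(series) > 4:
        plot_acf(series, lags=min(20, len(series) // 2 - 1),
                 title=f"ACF of stationarized {col}")
        plt.show()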
import pandas as pd
# Load the stationary dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)
# Generate descriptive statistics
descriptive_stats = df.describe()
# Save the descriptive statistics to a separate file so the stationarized
# dataset is not overwritten (later cells reload it as data)
output_path = "/content/IFM_descriptive_stats.xlsx"
descriptive_stats.to_excel(output_path)
# Display the statistics
print(descriptive_stats)
print(f"\n✅ Descriptive statistics saved at {output_path}")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# Load dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)
# Handling missing & infinite values
# Replace Inf values with NaN, but fill NaNs with 0 to avoid dropping all rows
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True) # Fill NaN values with 0
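# --- Alternative (a hedged sketch, not applied here): zero-filling NaNs can
# bias the OLS fits below when a series is not centred on zero. Linear
# interpolation preserves each series' level; it would replace, not follow,
# the fillna(0) call above:
# df.replace([np.inf, -np.inf], np.nan, inplace=True)
# df = df.interpolate(method='linear', limit_direction='both')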
variable_pairs = [
("ILSFORWARDS", "ILSSPOT"),
("ILSFORWARDS", "INRSPOT"),
("ILSFORWARDS", "IQDSPOT"),
("ILSFORWARDS", "JODSPOT"),
("ILSFORWARDS", "EGPSPOT")
]
# Perform OLS regression and visualize results
for x_var, y_var in variable_pairs:
    plt.figure(figsize=(8, 5))
    # Scatter plot with regression line
    sns.regplot(x=df[x_var], y=df[y_var], ci=None,
                line_kws={"color": "red"}, scatter_kws={"alpha": 0.5})
    # Fit OLS model
    X = sm.add_constant(df[x_var])  # Add constant for intercept
    model = sm.OLS(df[y_var], X).fit()
    # Print regression summary
    print(f"\n📌 OLS Regression Results for {x_var} vs {y_var}")
    print(model.summary())
    plt.show()  # Render each pair's plot next to its summary
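# --- Optional diagnostics (a minimal sketch) for the most recently fitted
# model: the Durbin-Watson statistic flags residual autocorrelation (values
# near 2 suggest none) and Jarque-Bera tests residual normality. Move inside
# the loop above to diagnose every pair.
from statsmodels.stats.stattools import durbin_watson, jarque_bera

dw = durbin_watson(model.resid)
jb_stat, jb_p, _, _ = jarque_bera(model.resid)
print(f"Durbin-Watson: {dw:.3f} (≈ 2 means little autocorrelation)")
print(f"Jarque-Bera p-value: {jb_p:.4f} (p < 0.05 suggests non-normal residuals)")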
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.vector_ar.vecm import coint_johansen
# Load the dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)
# Handle missing values: forward fill, then backward fill
df.ffill(inplace=True)
df.bfill(inplace=True)
# Drop helper columns if present (errors='ignore' tolerates missing ones)
df = df.drop(columns=['Unnamed: 11', 'how to retrive data'], errors='ignore')
# Select numeric columns only
numeric_df = df.select_dtypes(include=np.number)
# Exclude 'Index' column if present
if 'Index' in numeric_df.columns:
numeric_df = numeric_df.drop(columns=['Index'])
# Check that numeric_df has sufficient variables and observations
if len(numeric_df.columns) < 2:
    raise ValueError("Not enough numeric variables for cointegration test! "
                     f"You have {len(numeric_df.columns)} variables; at least 2 are required.")
# The Johansen test needs more observations than variables; if not, keep only
# the first (n_obs - 1) columns. In practice the subset should be chosen from
# domain knowledge rather than column order.
if numeric_df.shape[0] <= len(numeric_df.columns):
    print("Reducing the number of variables to match the number of observations.")
    selected_vars = numeric_df.columns[:numeric_df.shape[0] - 1]
    numeric_df = numeric_df[selected_vars]
    print(f"Selected variables: {list(selected_vars)}")
# Recheck conditions for the Johansen test
num_vars = len(numeric_df.columns)
num_obs = numeric_df.shape[0]
if num_vars < 2 or num_obs <= num_vars:
    raise ValueError(f"Not enough data for cointegration test! You have {num_vars} variables "
                     f"and {num_obs} observations. Adjust your data.")
# Visualize the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
def johansen_test(data, k_ar_diff=2):
    # Convert to a NumPy array and confirm there are more rows than columns
    data_array = data.to_numpy()
    print("Data shape:", data_array.shape)
    if data_array.shape[0] <= data_array.shape[1]:
        raise ValueError("Not enough observations for the Johansen test! Ensure rows > columns.")
    # Run the Johansen test (det_order=0: constant term, no linear trend)
    result = coint_johansen(data_array, det_order=0, k_ar_diff=k_ar_diff)
    # Compare the trace statistic with the 5% critical value at each rank
    print("\n💡 Johansen Cointegration Test Results:")
    for i in range(len(result.lr1)):
        trace_stat = result.lr1[i]
        crit_value = result.cvt[i, 1]  # 5% critical value
        if trace_stat > crit_value:
            print(f"✅ Cointegration found at rank {i+1}: Trace Statistic = "
                  f"{trace_stat:.4f}, Critical Value (5%) = {crit_value:.4f}")
        else:
            print(f"❌ No cointegration at rank {i+1}: Trace Statistic = "
                  f"{trace_stat:.4f}, Critical Value (5%) = {crit_value:.4f}")
    return result

# Run the Johansen test on the prepared data
result = johansen_test(numeric_df)
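# --- Optional check (a minimal sketch): coint_johansen also exposes the
# maximum-eigenvalue statistic (result.lr2) with critical values in
# result.cvm. Reading it alongside the trace statistic is common practice;
# this reuses the `result` object returned above.
print("\nMaximum-eigenvalue statistics:")
for i in range(len(result.lr2)):
    stat, crit = result.lr2[i], result.cvm[i, 1]  # 5% critical value
    verdict = "cointegration" if stat > crit else "no cointegration"
    print(f"Rank {i+1}: max-eigen = {stat:.4f}, 5% critical = {crit:.4f} → {verdict}")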
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import QuantileTransformer
# Shapiro-Wilk normality test: returns the p-value, or NaN for non-numeric series
def shapiro_test(series):
    if pd.api.types.is_numeric_dtype(series):
        return stats.shapiro(series.dropna())[1]
    return np.nan
# Changed "ISLFORWARDS" to "ILSFORWARDS"
# Apply Rank-Based Inverse Normal Transformation to ILSFORWARDS, if not normal
if shapiro_test(df["ILSFORWARDS"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["ILSFORWARDS"] = qt.fit_transform(df[["ILSFORWARDS"]])
if shapiro_test(df["INRFORWARDS"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["INRFORWARDS"] = qt.fit_transform(df[["INRFORWARDS"]])
if shapiro_test(df["EGPFORWARDS"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["EGPFORWARDS"] = qt.fit_transform(df[["EGPFORWARDS"]])
if shapiro_test(df["IQDFORWARDS"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["IQDFORWARDS"] = qt.fit_transform(df[["IQDFORWARDS"]])
if shapiro_test(df["JODFORWARDS"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["JODFORWARDS"] = qt.fit_transform(df[["JODFORWARDS"]])
if shapiro_test(df["ILSSPOT"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["ILSSPOT"] = qt.fit_transform(df[["ILSSPOT"]])
if shapiro_test(df["INRSPOT"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["INRSPOT"] = qt.fit_transform(df[["INRSPOT"]])
if shapiro_test(df["EGPSPOT"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["EGPSPOT"] = qt.fit_transform(df[["EGPSPOT"]])
if shapiro_test(df["IQDSPOT"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["IQDSPOT"] = qt.fit_transform(df[["IQDSPOT"]])
if shapiro_test(df["JODSPOT"]) < 0.05:
qt = QuantileTransformer(output_distribution="normal", random_state=42)
df["JODSPOT"] = qt.fit_transform(df[["JODSPOT"]])
# Recheck normality, but only for numeric columns
numeric_columns = df.select_dtypes(include=np.number).columns
shapiro_results = {col: shapiro_test(df[col]) for col in numeric_columns}
# Save the final normalized dataset
output_path = "/content/Stationarized_IFM_data_v2.xlsx"
df.to_excel(output_path, index=False)
# Print results
print("✅ Final Normality Check After Fixing ISLFORWARDS:")
for col, p_value in shapiro_results.items():
    if not np.isnan(p_value):  # Skip non-numeric columns
        status = "✅ Normally distributed" if p_value > 0.05 else "❌ NOT normally distributed"
        print(f"{col}: p = {p_value:.5f} → {status}")
print(f"📂 Fully normalized dataset saved at {output_path}")
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests
# Load the stationary dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path, index_col=0)  # Assuming the first column is the date/group index
# Drop any remaining NaN values after differencing
df.dropna(inplace=True)
# Define max lags for the test
max_lag = max(1, min(4, int((df.shape[0] - 1) / 3))) # Ensure it's at least 1
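# --- Alternative (a hedged sketch): information criteria offer a more
# principled lag choice than the heuristic above. statsmodels' VAR
# select_order reports AIC/BIC/HQIC per lag; this only prints the summary and
# does not override max_lag.
from statsmodels.tsa.api import VAR

try:
    order_results = VAR(df.select_dtypes(include=[np.number])).select_order(maxlags=max_lag)
    print(order_results.summary())
except Exception as e:
    print(f"Lag-order selection skipped: {e}")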
# Function to perform Granger Causality Test between all variables
def granger_causality_matrix(data, max_lag, test='ssr_chi2test'):
    """
    Returns a DataFrame with p-values of the Granger causality test.
    Each cell (row, column) indicates whether the column variable
    Granger-causes the row variable.
    """
    numeric_data = data.select_dtypes(include=[np.number])
    variables = numeric_data.columns
    # Empty matrix to store p-values
    result_matrix = pd.DataFrame(np.zeros((len(variables), len(variables))),
                                 columns=variables, index=variables)
    for col_y in variables:      # Dependent variable (y)
        for col_x in variables:  # Independent variable (x)
            if col_x == col_y:
                continue
            try:
                # Need more rows than lags
                if numeric_data[[col_y, col_x]].shape[0] <= max_lag:
                    raise ValueError(f"Not enough rows for max_lag={max_lag} "
                                     f"between {col_y} and {col_x}.")
                # Constant columns make the test degenerate
                if numeric_data[[col_y, col_x]].var().min() == 0:
                    raise ValueError(f"One or both columns ({col_y}, {col_x}) "
                                     f"have zero variance.")
                # Perform the Granger causality test
                test_result = grangercausalitytests(numeric_data[[col_y, col_x]],
                                                    maxlag=max_lag, verbose=False)
                p_value = test_result[max_lag][0][test][1]  # Extract p-value
                result_matrix.loc[col_y, col_x] = p_value
            except Exception as e:
                print(f"Error testing causality between {col_y} and {col_x}: {e}")
                result_matrix.loc[col_y, col_x] = np.nan  # NaN if the test fails
    return result_matrix
# Run Granger Causality test
granger_results = granger_causality_matrix(df, max_lag)
# Save the p-value matrix to a separate file so the stationarized dataset is
# not overwritten
output_path = "/content/Granger_causality_results.xlsx"
granger_results.to_excel(output_path)
print("\n✅ Granger Causality Test completed! Results saved at:", output_path)
import pandas as pd
results = pd.read_excel("/content/Granger_causality_results.xlsx", index_col=0)
print(results)