Ifm Group2 Code

The document outlines a data analysis workflow in Python, focusing on time series stationarity, differencing, and normalization of financial data from an Excel file. It covers checking stationarity with the ADF test, fitting OLS regressions, running Johansen cointegration tests, and performing Granger causality tests. The results, including descriptive statistics and visualizations, are saved to Excel files for further analysis.

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, coint, grangercausalitytests
from scipy.stats import shapiro

# Load the Excel file
file_path = "/content/Stationarized_IFM_data_v2.xlsx"  # Update this if needed
df = pd.read_excel(file_path)

# Display column names
print("Columns in the dataset:", df.columns)

import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Load the dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)

# Function to check stationarity with the ADF test
def check_stationarity(series, significance=0.05):
    # Treat a series that is empty (or all-NaN after dropna) as stationary to
    # avoid an adfuller error
    if series.dropna().empty:
        return True
    result = adfuller(series.dropna())
    return result[1] < significance  # True if stationary at the given level
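
# Illustrative sanity check (not in the original notebook): on synthetic data a
# random walk should typically come back non-stationary, while its first
# difference (white noise) should pass the ADF test.
rng = np.random.default_rng(0)
walk = pd.Series(np.cumsum(rng.normal(size=500)))
print(check_stationarity(walk))         # typically False (random walk)
print(check_stationarity(walk.diff()))  # typically True (white noise)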

# Apply differencing iteratively until all attributes become stationary
stationary_df = df.copy()
columns_to_check = stationary_df.columns[1:]  # Exclude the 'Group' column

for col in columns_to_check:
    diff_count = 0
    while not check_stationarity(stationary_df[col]):
        # Assigning .diff() back leaves a leading NaN; adfuller drops it above
        stationary_df[col] = stationary_df[col].diff()
        diff_count += 1
        if diff_count > 5:  # Prevent infinite loops
            print(f"{col} is not stationary even after 5 differences.")
            break
    print(f"{col} became stationary after {diff_count} differencing(s).")

# Save the stationary dataset
stationary_file_path = "/content/Stationarized_IFM_data_v2.xlsx"
stationary_df.to_excel(stationary_file_path, index=False)
print(f"Stationary dataset saved at {stationary_file_path}")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller

# Load data
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)

# List of columns to check for stationarity
columns = ['Index', 'ILSSPOT', 'ILSFORWARDS', 'INRSPOT', 'INRFORWARDS', 'EGPSPOT',
           'EGPFORWARDS', 'IQDSPOT', 'IQDFORWARDS', 'JODSPOT', 'JODFORWARDS',
           'Unnamed: 11', 'how to retrive data']

# Dictionary to store differencing counts
diff_counts = {col: 0 for col in columns}

# Function to check stationarity using the ADF test
def is_stationary(series, significance=0.05):
    # Treat a series that is empty (or all-NaN after dropna) as stationary to
    # avoid an adfuller error
    if series.dropna().empty:
        return True
    result = adfuller(series.dropna(), autolag='AIC')
    return result[1] < significance

# Create a copy of the original dataframe for plotting
df_original = df.copy()

# Apply differencing iteratively until stationarity is achieved
for col in columns:
    temp_series = df[col].copy()
    # Keep differencing while the series is non-stationary and still has data left
    while not is_stationary(temp_series) and not temp_series.dropna().empty:
        temp_series = temp_series.diff().dropna()
        diff_counts[col] += 1
    df[col] = temp_series  # Store the transformed data

# Save the stationary dataset
output_path = "/content/Stationarized_IFM_data_v2.xlsx"
df.to_excel(output_path, index=False)

# Plot original vs stationary data
fig, axes = plt.subplots(nrows=len(columns), ncols=2, figsize=(12, 18))
fig.suptitle("Original vs Stationary Time Series", fontsize=14)

for i, col in enumerate(columns):
    # Align the original and stationary series properly
    original_series = df_original[col].dropna()
    stationary_series = df[col].dropna()

    axes[i, 0].plot(original_series.index, original_series, label="Original")
    axes[i, 0].set_title(f"Original {col}")

    axes[i, 1].plot(stationary_series.index, stationary_series, label="Stationary",
                    color="red")
    axes[i, 1].set_title(f"Stationary {col} after {diff_counts[col]} differencing(s)")

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

import pandas as pd

# Load the stationary dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)

# Generate descriptive statistics
descriptive_stats = df.describe()

# Save the descriptive statistics to a separate file so the stationary dataset
# (reloaded by later cells) is not overwritten
output_path = "/content/Descriptive_stats_IFM.xlsx"
descriptive_stats.to_excel(output_path)

# Display the statistics
print(descriptive_stats)
print(f"\n✅ Descriptive statistics saved at {output_path}")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Load dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)

# Handle missing & infinite values:
# replace Inf values with NaN, then fill NaNs with 0 to avoid dropping all rows
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

variable_pairs = [
("ILSFORWARDS", "ILSSPOT"),
("ILSFORWARDS", "INRSPOT"),
("ILSFORWARDS", "IQDSPOT"),
("ILSFORWARDS", "JODSPOT"),
("ILSFORWARDS", "EGPSPOT")
]

# Perform OLS regression and visualize results
for x_var, y_var in variable_pairs:
    plt.figure(figsize=(8, 5))

    # Scatter plot with regression line
    sns.regplot(x=df[x_var], y=df[y_var], ci=None, line_kws={"color": "red"},
                scatter_kws={"alpha": 0.5})

    # Fit OLS model
    X = sm.add_constant(df[x_var])  # Add a constant for the intercept
    model = sm.OLS(df[y_var], X).fit()

    # Print regression summary
    print(f"\n📌 OLS Regression Results for {x_var} vs {y_var}")
    print(model.summary())
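
# Illustrative follow-up (not in the original notebook): instead of reading the
# full summary table, the slope, its p-value, and R-squared can be pulled from
# the fitted statsmodels results object; shown here for the last pair in the loop.
slope = model.params[x_var]
p_value = model.pvalues[x_var]
print(f"{x_var} -> {y_var}: slope = {slope:.4f}, p = {p_value:.4g}, "
      f"R^2 = {model.rsquared:.3f}")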

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.vector_ar.vecm import coint_johansen
# Load the dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path)

# Handle missing values: forward fill, then backward fill
df.ffill(inplace=True)  # fill forward
df.bfill(inplace=True)  # fill backward

# Drop irrelevant columns if present (errors='ignore' avoids a KeyError when
# only one of them exists)
df = df.drop(columns=['Unnamed: 11', 'how to retrive data'], errors='ignore')

# Select numeric columns only
numeric_df = df.select_dtypes(include=np.number)

# Exclude the 'Index' column if present
if 'Index' in numeric_df.columns:
    numeric_df = numeric_df.drop(columns=['Index'])

# Check that numeric_df has enough variables and observations
if len(numeric_df.columns) < 2:
    raise ValueError("Not enough numeric variables for cointegration test! "
                     f"You have {len(numeric_df.columns)} variables; at least 2 are required.")

# The Johansen test needs more observations than variables; if that fails,
# reduce the number of variables (or collect more observations).
if numeric_df.shape[0] <= len(numeric_df.columns):
    print("Reducing the number of variables to match the number of observations.")
    # Select a subset of variables to use in the test. Ideally this choice is
    # based on domain knowledge; here we simply keep the first n-1 variables,
    # where n is the number of observations.
    selected_vars = numeric_df.columns[:numeric_df.shape[0] - 1]
    numeric_df = numeric_df[selected_vars]
    print(f"Selected variables: {selected_vars}")

# Recheck conditions for the Johansen test
num_vars = len(numeric_df.columns)
num_obs = numeric_df.shape[0]

if num_vars < 2 or num_obs <= num_vars:
    raise ValueError(f"Not enough data for cointegration test! You have {num_vars} variables "
                     f"and {num_obs} observations. Adjust your data.")

# Visualize the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
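
# Illustrative follow-up (not in the original notebook): report the most
# strongly correlated pair from the matrix just plotted.
corr = numeric_df.corr().abs()
np.fill_diagonal(corr.values, 0)  # ignore self-correlation on the diagonal
strongest = corr.stack().idxmax()
print(f"Strongest correlation: {strongest[0]} vs {strongest[1]} "
      f"({corr.stack().max():.2f})")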

def johansen_test(data, k_ar_diff=2):
    # Convert to a NumPy array
    data_array = data.to_numpy()

    # Check the shape of the data
    print("Data shape:", data_array.shape)
    if data_array.shape[0] <= data_array.shape[1]:
        raise ValueError("Not enough observations for the Johansen test! "
                         "Ensure rows > columns.")

    # Run the Johansen test
    result = coint_johansen(data_array, det_order=0, k_ar_diff=k_ar_diff)

    # Print results: compare each trace statistic against its 5% critical value
    print("\n💡 Johansen Cointegration Test Results:")
    for i in range(len(result.lr1)):
        trace_stat = result.lr1[i]
        crit_value = result.cvt[i, 1]  # 5% critical value

        if trace_stat > crit_value:
            print(f"✅ Cointegration found at rank {i+1}: Trace Statistic = "
                  f"{trace_stat:.4f}, Critical Value (5%) = {crit_value:.4f}")
        else:
            print(f"❌ No cointegration at rank {i+1}: Trace Statistic = "
                  f"{trace_stat:.4f}, Critical Value (5%) = {crit_value:.4f}")
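
# The function above is defined but never invoked in this listing; a minimal
# call on the numeric columns prepared earlier could look like this
# (k_ar_diff=2 mirrors the function's default lag order and is an assumption):
johansen_test(numeric_df, k_ar_diff=2)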

import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import QuantileTransformer

# Function for the Shapiro-Wilk normality test
def shapiro_test(series):
    # Only numeric series can be tested
    if pd.api.types.is_numeric_dtype(series):
        p_value = stats.shapiro(series)[1]
        return p_value
    else:
        return np.nan  # Return NaN for non-numeric series

# Apply a rank-based inverse normal transformation (QuantileTransformer) to each
# spot and forward series that fails the Shapiro-Wilk test at the 5% level
columns_to_normalize = ["ILSFORWARDS", "INRFORWARDS", "EGPFORWARDS", "IQDFORWARDS",
                        "JODFORWARDS", "ILSSPOT", "INRSPOT", "EGPSPOT", "IQDSPOT",
                        "JODSPOT"]

for col in columns_to_normalize:
    if shapiro_test(df[col]) < 0.05:
        qt = QuantileTransformer(output_distribution="normal", random_state=42)
        df[col] = qt.fit_transform(df[[col]])
# Recheck normality, but only for numeric columns
numeric_columns = df.select_dtypes(include=np.number).columns
shapiro_results = {col: shapiro_test(df[col]) for col in numeric_columns}

# Save the final normalized dataset
output_path = "/content/Stationarized_IFM_data_v2.xlsx"
df.to_excel(output_path, index=False)

# Print results
print("✅ Final Normality Check After Fixing ILSFORWARDS:")
for col, p_value in shapiro_results.items():
    if not np.isnan(p_value):  # Skip non-numeric columns
        status = "✅ Normally distributed" if p_value > 0.05 else "❌ NOT normally distributed"
        print(f"{col}: p = {p_value:.5f} → {status}")

print(f"📂 Fully normalized dataset saved at {output_path}")

import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

# Load the stationary dataset
file_path = "/content/Stationarized_IFM_data_v2.xlsx"
df = pd.read_excel(file_path, index_col=0)  # Assuming the first column is DATE/GROUP

# Drop any remaining NaN values after differencing
df.dropna(inplace=True)

# Define the maximum lag for the test: at least 1, at most 4, and no more than
# a third of the available observations
max_lag = max(1, min(4, int((df.shape[0] - 1) / 3)))

# Function to perform the Granger causality test between all variable pairs
def granger_causality_matrix(data, max_lag, test='ssr_chi2test'):
    """
    Returns a DataFrame with p-values of the Granger causality test.
    Each cell (row, column) indicates whether the column variable
    Granger-causes the row variable.
    """
    numeric_data = data.select_dtypes(include=[np.number])
    variables = numeric_data.columns  # Get column names

    # Create an empty matrix to store p-values
    result_matrix = pd.DataFrame(np.zeros((len(variables), len(variables))),
                                 columns=variables, index=variables)

    for col_y in variables:      # Dependent variable (y)
        for col_x in variables:  # Independent variable (x)
            if col_x != col_y:
                try:
                    # Check data size
                    if numeric_data[[col_y, col_x]].shape[0] <= max_lag:
                        raise ValueError(f"Not enough rows for max_lag={max_lag} "
                                         f"between {col_y} and {col_x}.")

                    # Check for constant data
                    if numeric_data[[col_y, col_x]].var().min() == 0:
                        raise ValueError(f"One or both columns ({col_y}, {col_x}) "
                                         f"have zero variance.")

                    # Perform the Granger causality test
                    test_result = grangercausalitytests(numeric_data[[col_y, col_x]],
                                                        maxlag=max_lag, verbose=False)
                    p_value = test_result[max_lag][0][test][1]  # Extract p-value
                    result_matrix.loc[col_y, col_x] = p_value

                except Exception as e:
                    print(f"Error testing causality between {col_y} and {col_x}: {e}")
                    result_matrix.loc[col_y, col_x] = np.nan  # NaN if the test fails

    return result_matrix

# Run the Granger causality test
granger_results = granger_causality_matrix(df, max_lag)

# Save the p-value matrix to its own Excel file so the normalized dataset is
# not overwritten
output_path = "/content/Granger_results_IFM.xlsx"
granger_results.to_excel(output_path)

print("\n✅ Granger Causality Test completed! Results saved at:", output_path)

import pandas as pd
results = pd.read_excel("/content/Granger_results_IFM.xlsx", index_col=0)
print(results)
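
# Illustrative follow-up (not in the original notebook): list the variable pairs
# whose Granger-test p-value clears the conventional 5% threshold.
for (caused, cause), p in results.stack().items():
    if cause != caused and p < 0.05:
        print(f"{cause} Granger-causes {caused} (p = {p:.4f})")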
