This repository contains the projects completed as part of Springboard's Data Science Career Track. The capstone and data visualisation projects are not included here; they are available as separate repositories.
Disclaimer: If you are a Springboard DSC student, I strongly suggest you refrain from viewing the code until you have attempted to solve the problems yourself.
- Understanding Country Club Database with SQL - Manipulating data in SQL
- Analyzing World Bank Projects - Data Wrangling with a JSON file
- API Project - Quandl - Data Wrangling
- What is the True Normal Human Body Temperature? - Inferential Statistics
- Examining Racial Discrimination in the US Job Market - Inferential Statistics
- Hospital Readmission Analysis and Recommendations - Inferential Statistics
- Predicting House Prices using Linear Regression - Supervised Machine Learning Project
- Predicting Gender using Logistic Regression - Supervised Machine Learning Project
- Movie Review Sentiment Analysis using Naive Bayes - Supervised Machine Learning Project
- Wine Customer Segmentation using Unsupervised Learning - Unsupervised Machine Learning Project
- Spark Project - Databricks
- Ultimate Inc. Data Science Challenge - Time Series Project
- Relax Inc. Data Science Challenge
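
The utility function below flattens nested JSON (dictionaries and lists of dictionaries) into a flat pandas DataFrame, prefixing each expanded column with the name of the column it came from.
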
import json
from typing import Any, Dict, List, Union

import pandas as pd


def expand_nested_json(data: Union[str, Dict[str, Any], List[Dict[str, Any]]],
                       separator: str = "_") -> pd.DataFrame:
    """Flatten nested JSON (dicts and lists of dicts) into a flat DataFrame."""
    # If the input is a JSON string, parse it first
    if isinstance(data, str):
        data = json.loads(data)
    # If the input is a single dictionary, wrap it in a list of records
    if isinstance(data, dict):
        data = [data]

    # Convert the list of records to a DataFrame
    df = pd.DataFrame(data)

    # Helper to flatten nested columns
    def flatten_nested_columns(df: pd.DataFrame, separator: str = "_") -> pd.DataFrame:
        # Create a copy to avoid modifying the original DataFrame
        result_df = df.copy()

        # Find columns that contain dictionaries or lists
        nested_columns = [
            col for col in result_df.columns
            if any(isinstance(val, (dict, list)) for val in result_df[col].dropna())
        ]

        # No nested columns to expand
        if not nested_columns:
            return result_df

        # Process each nested column
        for col in nested_columns:
            # Handle dictionary columns
            if any(isinstance(val, dict) for val in result_df[col].dropna()):
                # Expand each dictionary key into its own column
                expanded = pd.json_normalize(
                    result_df[col].apply(lambda x: x if isinstance(x, dict) else {}).tolist(),
                    sep=separator,
                )
                expanded.index = result_df.index
                # Rename columns with the original column name as prefix
                expanded.columns = [f"{col}{separator}{subcol}" for subcol in expanded.columns]
                # Drop the original column and join with the expanded columns
                result_df = result_df.drop(col, axis=1).join(expanded)

            # Handle list columns
            elif any(isinstance(val, list) for val in result_df[col].dropna()):
                # Handle lists of dictionaries
                if any(isinstance(item, dict)
                       for sublist in result_df[col].dropna() if isinstance(sublist, list)
                       for item in sublist):
                    # Create a temporary column that remembers the original row
                    result_df['_temp_idx'] = range(len(result_df))
                    # Explode the list column into one row per list element
                    exploded = result_df[[col, '_temp_idx']].explode(col)
                    # Normalize the exploded dictionaries
                    if not exploded.empty and any(isinstance(val, dict) for val in exploded[col].dropna()):
                        expanded = pd.json_normalize(
                            exploded[col].apply(lambda x: x if isinstance(x, dict) else {}).tolist(),
                            sep=separator,
                        )
                        # Prefix column names with the original column name
                        expanded.columns = [f"{col}{separator}{subcol}" for subcol in expanded.columns]
                        # Carry the row marker over to the normalized frame
                        expanded['_temp_idx'] = exploded['_temp_idx'].values
                        # Group by the row marker and collect expanded values back into lists
                        grouped = expanded.groupby('_temp_idx').agg(list)
                        # Join the regrouped columns back onto the original DataFrame
                        result_df = result_df.drop(col, axis=1).join(grouped, on='_temp_idx')
                    # Clean up the temporary index column
                    result_df = result_df.drop('_temp_idx', axis=1)
                # Handle simple lists (strings, numbers)
                else:
                    # Serialise simple lists to JSON strings for a flat representation
                    result_df[col] = result_df[col].apply(
                        lambda x: json.dumps(x) if isinstance(x, list) else x
                    )

        # Recursively process any new nested columns that were created
        return flatten_nested_columns(result_df, separator)

    # Apply the recursive flattening
    return flatten_nested_columns(df, separator)
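
As a quick usage sketch (assuming the function above is in scope), here is a made-up record; the field names `address`, `orders`, and `tags` are purely illustrative:

```python
# Hypothetical nested record, purely for illustration
records = [
    {
        "id": 1,
        "address": {"city": "Boston", "zip": "02118"},   # nested dict
        "orders": [{"item": "book", "qty": 2},           # list of dicts
                   {"item": "pen", "qty": 5}],
        "tags": ["new", "priority"],                     # simple list
    }
]

df = expand_nested_json(records)
print(df.columns.tolist())
# With the flattening logic above, this should print:
# ['id', 'tags', 'address_city', 'address_zip', 'orders_item', 'orders_qty']
```

Note that simple lists (and the regrouped values from lists of dictionaries) end up serialised as JSON strings, which keeps the result easy to write to a flat file such as a CSV.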