# === CFTC COT → Excel Leaderboard (TEXT WIZARD, dynamic columns, disk cache) ===
# - Text prompts only (works in notebook/terminal).
# - Discovers all available columns dynamically from the chosen report.
# - Lets you pick ANY column, transform it, and rank cross-sectionally using last week or last N weeks.
# - Minimal downloads + persistent on-disk cache (survives kernel restarts).
# - Outputs clean Excel with Rankings + Params (+ optional TopN series).
# ------------------------------------------------------------------------------
# 0) Future-import first (it must precede all other statements), then cap BLAS
#    threads (helps stability) BEFORE importing numpy/pandas.
from __future__ import annotations

import os
for k in ["OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"]:
    os.environ.setdefault(k, "1")
import re, warnings, time, json, pathlib
from typing import List
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
# 1) Optional progress bar
try:
from tqdm import tqdm
_HAS_TQDM = True
except Exception:
_HAS_TQDM = False
# 2) Core libs
import numpy as np
import pandas as pd
# 3) Optional HTTP cache (speeds up repeat CFTC fetches this session)
try:
import requests_cache
requests_cache.install_cache("cot_http_cache", expire_after=3600)
except Exception:
pass
# 4) pycot import (supports both spellings)
try:
from pycot.reports import CommitmentOfTraders as COT
except Exception:
from pycot.reports import CommitmentsOfTraders as COT
# Quiet down noisy dateutil warning from pycot
warnings.filterwarnings(
    "ignore",
    message="Could not infer format, so each element will be parsed individually, falling back to `dateutil`.",
    module="pycot.reports",
)
# ---------------------- Disk cache ----------------------
CACHE_DIR = pathlib.Path("./cot_cache_v2")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
def _safe_name(txt: str) -> str:
return re.sub(r"[^A-Za-z0-9_.-]+", "_", txt.strip())
def _df_cache_path(report_code: str, contract: str) -> pathlib.Path:
    return CACHE_DIR / f"{_safe_name(report_code)}__{_safe_name(contract)}.parquet"
def _meta_path(report_code: str) -> pathlib.Path:
return CACHE_DIR / f"{_safe_name(report_code)}__meta.json"
def _choices_path(report_code: str) -> pathlib.Path:
return CACHE_DIR / f"{_safe_name(report_code)}__last_choices.json"
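# Cache paths come from sanitized names; e.g. (illustrative contract name):
#   _df_cache_path("legacy_fut", "WHEAT-SRW - CHICAGO BOARD OF TRADE")
#   → cot_cache_v2/legacy_fut__WHEAT-SRW_-_CHICAGO_BOARD_OF_TRADE.parquet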
def _load_cached_df(report_code: str, contract: str) -> pd.DataFrame | None:
p = _df_cache_path(report_code, contract)
if not p.exists(): return None
try:
df = pd.read_parquet(p)
df.index = pd.to_datetime(df.index).tz_localize(None)
return df
except Exception:
# fallback: maybe it was saved as pickle earlier
try:
df = pd.read_pickle(p.with_suffix(".pkl"))
df.index = pd.to_datetime(df.index).tz_localize(None)
return df
except Exception:
return None
def _save_cached_df(report_code: str, contract: str, df: pd.DataFrame):
p = _df_cache_path(report_code, contract)
try:
df.to_parquet(p)
except Exception:
df.to_pickle(p.with_suffix(".pkl"))
def _load_meta(report_code: str) -> dict:
mp = _meta_path(report_code)
if mp.exists():
try:
return json.loads(mp.read_text())
except Exception:
return {}
return {}
def _save_meta(report_code: str, meta: dict):
_meta_path(report_code).write_text(json.dumps(meta, indent=2, default=str))
def _save_choices(report_code: str, choices: dict):
    _choices_path(report_code).write_text(json.dumps(choices, indent=2, default=str))
# ---------------------- Helpers ----------------------
def _norm(txt: str) -> str:
return re.sub(r"\s+", " ", str(txt).strip().lower())
def _find_col(df, must_include: List[str], must_exclude: List[str] | None = None):
must_exclude = must_exclude or []
for c in df.columns:
lc = _norm(c)
if all(s in lc for s in must_include) and not any(x in lc for x in must_exclude):
return c
return None
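# Matching is by lower-cased substring, so (with a hypothetical header)
#   _find_col(df, ["net % of oi", "managed", "money"])
# would return a column named e.g. "Net % of OI - Managed Money" if present.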
def _get_OI(df):
for key in ["open interest","openinterest","oi"]:
c = _find_col(df, [key])
if c: return df[c].astype(float)
if "OI" in df.columns: return df["OI"].astype(float)
raise KeyError("Open Interest column not found.")
def _match_any_synonym(df, keys_any: List[str]):
packs = [
["commercial"],
["producer","merchant","processor","user"], ["prod","merch","proc","user"],
["noncommercial"], ["large","spec"], ["managed","money"],
["dealer"], ["asset","manager"], ["leveraged","funds"], ["swap","dealer"],
["other","reportable"],
]
# Prefer Net %OI if present
for pack in packs:
if not any(k in pack for k in keys_any): continue
pct = _find_col(df, ["net % of oi", *pack])
if pct: return {"net_pct": df[pct].astype(float), "tokens": pack}
# Fallback: long/short pairs
for pack in packs:
if not any(k in pack for k in keys_any): continue
Lc = _find_col(df, ["long", *pack], must_exclude=["%"])
Sc = _find_col(df, ["short",*pack], must_exclude=["%"])
if Lc and Sc:
return {"long": df[Lc].astype(float), "short": df[Sc].astype(float), "tokens":
pack}
return None
def _get_net_pct(df, keys_any: List[str]):
hit = _match_any_synonym(df, keys_any)
if not hit: return None
if "net_pct" in hit: return hit["net_pct"]
OI = _get_OI(df); eps = 1e-9
return 100.0 * ((hit["long"] - hit["short"]) / (OI + eps))
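# Fallback formula when no ready-made "Net % of OI" column exists:
#   net %OI = 100 * (Long - Short) / (OI + eps)
# e.g. Long=120k, Short=80k, OI=400k → 100 * 40k / 400k = 10.0 (%).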
# Rich column enumeration (label → Series)
_COL_REGEX = re.compile(
    r"(?i)^(?P<kind>net % of oi|net(?:\s*change)?|longs?|shorts?)"
    r"(?:\s*[-,:]?\s*)?(?P<group>.+)$"
)
def _norm_group(raw: str, report_code: str) -> str:
g = _norm(raw)
if "producer/merchant/processor/user" in g or all(k in g for k in
["prod","merch","proc","user"]):
return "Commercial (PM/P/U)"
if "managed money" in g: return "Managed Money"
if "swap dealer" in g: return "Swap Dealers"
if "other report" in g: return "Other Reportables"
if "noncommercial" in g or "large spec" in g: return "Speculators"
if "dealer" in g: return "Dealer"
if "asset" in g and "manager" in g: return "Asset Manager"
if "leveraged" in g and "fund" in g: return "Leveraged Funds"
if "non" in g and "reportable" in g: return "Non-reportable"
if "commercial" in g: return "Commercial"
return raw.strip()
def enumerate_cot_columns(df: pd.DataFrame, report_code: str) -> "OrderedDict[str, pd.Series]":
out = OrderedDict()
# Open interest
try:
out["Open Interest"] = _get_OI(df)
except Exception:
pass
# Patterned: Net %OI / Net / Net Change / Long / Short
for col in df.columns:
cstr = str(col).strip()
m = _COL_REGEX.match(cstr)
if not m: continue
kind = m.group("kind").lower()
group_lbl = _norm_group(m.group("group"), report_code)
s = df[col].astype(float)
if "net % of oi" in kind:
label = f"{group_lbl} — Net %OI"
elif "net change" in kind:
label = f"{group_lbl} — Net Change"
elif kind.startswith("net"):
label = f"{group_lbl} — Net (contracts)"
elif kind.startswith("long"):
label = f"{group_lbl} — Long"
else:
label = f"{group_lbl} — Short"
out[label] = s
# Synthesize common %OI if missing
if "Commercial — Net %OI" not in out:
s = _get_net_pct(df, ["commercial","producer","prod"])
if s is not None: out["Commercial — Net %OI"] = s
if "Speculators — Net %OI" not in out:
s = _get_net_pct(df, ["noncommercial","large spec","managed
money","leveraged","funds","asset","manager"])
if s is not None: out["Speculators — Net %OI"] = s
# Any remaining numeric columns (to be safe)
for col in df.columns:
name = str(col)
if name.lower() == "contract name": continue
if name in out: continue
s = df[col]
if pd.api.types.is_numeric_dtype(s):
out[name] = s.astype(float)
return out
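# Sketch of the labels this yields (actual labels depend on the report's headers):
#   >>> colmap = enumerate_cot_columns(proto_df, "disaggregated_futopt")
#   >>> list(colmap)[:3]
#   ['Open Interest', 'Managed Money — Net %OI', 'Commercial (PM/P/U) — Long']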
# ---------------------- Transforms & CS aggregations ----------------------
def _zscore(x: pd.Series, win: int) -> pd.Series:
m = x.rolling(win).mean(); s = x.rolling(win).std(ddof=0)
return (x - m) / (s.replace(0, np.nan) + 1e-9)
def _range_index(x: pd.Series, win: int) -> pd.Series:
mn = x.rolling(win).min(); mx = x.rolling(win).max()
return 100.0 * (x - mn) / ((mx - mn).replace(0, np.nan) + 1e-9)
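# Toy check (not executed): with s = pd.Series([1., 2., 3., 4., 5.]) and win=4,
#   _zscore(s, 4).iloc[-1]      ≈ 1.34   (last point vs its 4-week mean/std, ddof=0)
#   _range_index(s, 4).iloc[-1] ≈ 100.0  (sitting at the top of its 4-week range)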
TRANSFORMS = OrderedDict([
("Level", ("lvl", None)), # (code, default window)
("Index (156w)", ("idx", 156)),
("Z-score", ("z", 156)), # window chosen interactively
("Δ over N", ("dN", None)), # N chosen interactively
])
CS_AGGS = OrderedDict([
("Latest (last 1)", ("last", 1)),
("Average of last N", ("avgN", 4)),
("Sum of last N", ("sumN", 4)),
("Median of last N", ("medN", 4)),
("Change over last N", ("chgN", 4)),
("Min over last N", ("minN", 4)),
("Max over last N", ("maxN", 4)),
])
def apply_transform(s: pd.Series, code: str, win: int | None, N_for_delta: int | None) -> pd.Series:
code = code.lower()
if code == "lvl":
return s
if code == "idx":
w = int(win or 156)
mn=s.rolling(w).min(); mx=s.rolling(w).max()
return 100.0 * (s - mn) / ((mx - mn).replace(0, np.nan) + 1e-9)
if code == "z":
w = int(win or 156)
return _zscore(s, w)
if code == "dN":
k = int(N_for_delta or 4)
return s - s.shift(k)
return s
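# Wizard choice → call mapping, e.g. "Δ over N" with N=4 is
#   apply_transform(s, "dN", None, 4)    # equivalent to s - s.shift(4)
# and "Z-score" with a 156-week window is apply_transform(s, "z", 156, None).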
def cs_aggregate(s: pd.Series, agg_code: str, N: int) -> float | None:
ss = s.dropna()
if ss.empty: return None
if agg_code == "last":
return float(ss.iloc[-1])
tail = ss.tail(N)
if tail.empty: return None
if agg_code == "avgN": return float(tail.mean())
if agg_code == "sumN": return float(tail.sum())
if agg_code == "medN": return float(tail.median())
if agg_code == "chgN": return float(tail.iloc[-1] - tail.iloc[0])
if agg_code == "minN": return float(tail.min())
if agg_code == "maxN": return float(tail.max())
return float(tail.iloc[-1])
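# Toy check (not executed): with s = pd.Series([1., 2., 3., 4., 5.]) and N=4,
#   cs_aggregate(s, "avgN", 4) → 3.5   (mean of the last four points)
#   cs_aggregate(s, "chgN", 4) → 3.0   (5 - 2, change across the tail window)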
# ---------------------- Extra utilities (new) ----------------------
def winsorize_scalar(x: float, lo: float, hi: float) -> float:
return max(lo, min(hi, x))
def percentile_rank(values: pd.Series) -> pd.Series:
# 0..100 inclusive
n = len(values)
if n <= 1:
return pd.Series([100.0]*n, index=values.index, dtype=float)
ranks = values.rank(method="min", ascending=True) # small→low
return 100.0 * (ranks - 1) / (n - 1)
def decile_bucket(pct: float) -> str:
    if pct is None or np.isnan(pct): return ""
    d = min(9, int(np.floor(pct / 10.0)))  # clamp: pct == 100 belongs in D10, not a phantom D11
    return f"D{d+1}"
# ---------------------- Data access (cached + trimmed) ----------------------
# Refresh policy for the disk cache:
#   "reuse" → use the cached frame if present (fastest).
#   "check" → currently also serves from the cache; the pycot layer plus
#             requests_cache keeps any re-fetch traffic light.
#   "clear" → ignore any cache and re-download.
REFRESH_POLICIES = {"reuse","check","clear"}
def list_contracts(report_type: str) -> List[str]:
raw = COT(report_type).list_available_contracts()
return [str(x) for x in list(raw)]
def _latest_cftc_date(df: pd.DataFrame) -> pd.Timestamp | None:
if df is None or df.empty: return None
return pd.to_datetime(df.index.max()).tz_localize(None)
def _trim_weeks(df: pd.DataFrame, need_weeks: int) -> pd.DataFrame:
    if df is None or df.empty: return df
    cutoff = pd.Timestamp.today().tz_localize(None) - pd.DateOffset(weeks=int(need_weeks))
    return df.loc[df.index >= cutoff]
def fetch_cot_df(report_type: str, contract: str, *, need_weeks: int, policy: str = "check") -> pd.DataFrame:
policy = (policy or "check").lower()
cached = _load_cached_df(report_type, contract)
if policy == "reuse" and cached is not None:
return _trim_weeks(cached, need_weeks)
if policy == "check" and cached is not None:
# We'll still use cached; pycot layer + requests_cache will avoid heavy traffic
anyway.
return _trim_weeks(cached, need_weeks)
# "clear" OR no cache → fetch
cot = COT(report_type)
df = cot.report((contract,)).sort_index()
df.index = pd.to_datetime(df.index).tz_localize(None)
_save_cached_df(report_type, contract, df)
return _trim_weeks(df, need_weeks)
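# Usage sketch (the contract name is hypothetical; take real names from list_contracts()):
#   df = fetch_cot_df("legacy_fut", "GOLD - COMMODITY EXCHANGE INC.",
#                     need_weeks=156, policy="reuse")
#   df.index.max()   # most recent report date held for that contract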
# ---------------------- Wizard I/O ----------------------
def ask_choice(prompt: str, options: List[str], default_idx: int = 0) -> int:
while True:
print(f"\n{prompt}")
for i, opt in enumerate(options, 1):
dmark = " (default)" if (i-1) == default_idx else ""
print(f" {i}. {opt}{dmark}")
raw = input(f"Enter 1-{len(options)} (Enter={default_idx+1}): ").strip()
if raw == "":
return default_idx
if raw.isdigit():
k = int(raw) - 1
if 0 <= k < len(options):
return k
print("Invalid choice, try again.")
def ask_int(prompt: str, default: int, min_v: int = 1, max_v: int | None = None) -> int:
while True:
raw = input(f"{prompt} (default {default}): ").strip()
if raw == "":
return default
try:
v = int(raw)
if v < min_v:
print(f"Must be ≥ {min_v}")
continue
if max_v is not None and v > max_v:
print(f"Must be ≤ {max_v}")
continue
return v
except Exception:
print("Enter an integer.")
def ask_float(prompt: str, default: float, min_v: float = -1e9, max_v: float = 1e9) -> float:
while True:
raw = input(f"{prompt} (default {default}): ").strip()
if raw == "":
return default
try:
v = float(raw)
if v < min_v or v > max_v:
print(f"Must be between {min_v} and {max_v}")
continue
return v
except Exception:
print("Enter a number.")
def ask_str(prompt: str, default: str = "") -> str:
raw = input(f"{prompt} (Enter for '{default}'): ").strip()
return raw if raw != "" else default
def ask_yesno(prompt: str, default_yes: bool = True) -> bool:
d = "Y/n" if default_yes else "y/N"
while True:
raw = input(f"{prompt} ({d}): ").strip().lower()
if raw == "" : return default_yes
if raw in ["y","yes"]: return True
if raw in ["n","no"]: return False
print("Please answer y/n.")
# ---------------------- Leaderboard compute ----------------------
def compile_leaderboard_dynamic(
*,
report_code: str,
universe: List[str],
picked_label: str,
transform_code: str,
transform_win: int | None,
delta_N: int | None,
cs_agg_code: str,
cs_N: int,
rank_desc: bool,
need_weeks: int,
refresh_policy: str = "check",
max_workers: int = 1,
# new:
flip_sign: bool = False,
winsor_lo: float | None = None,
winsor_hi: float | None = None,
self_z_win: int = 156,
self_idx_win: int = 156,
) -> pd.DataFrame:
rows: List[dict] = []
dropped: List[str] = []
def _one(name: str):
try:
df = fetch_cot_df(report_code, name, need_weeks=need_weeks,
policy=refresh_policy)
colmap = enumerate_cot_columns(df, report_code)
if picked_label not in colmap:
return None
s = colmap[picked_label].astype(float)
sT = apply_transform(s, transform_code, transform_win, delta_N)
sT = sT.dropna()
if sT.empty:
return None
# self metrics on transformed series
            z_self = _zscore(sT, int(self_z_win)).iloc[-1] if len(sT) >= max(8, self_z_win // 4) else np.nan
            idx_self = _range_index(sT, int(self_idx_win)).iloc[-1] if len(sT) >= max(8, self_idx_win // 4) else np.nan
val = cs_aggregate(sT, cs_agg_code, cs_N)
if val is None:
return None
if flip_sign:
val = -float(val)
last_dt = pd.to_datetime(sT.index[-1])
return {
"Contract": name,
"Metric": picked_label,
"Value_raw": float(val if not flip_sign else -val), # original, before
sign flip
"Value_eff": float(val), # effective used for ranking (after flip)
"Self_Z": float(z_self) if z_self is not None else np.nan,
"Self_Strength": float(idx_self) if idx_self is not None else np.nan,
"As of": last_dt,
}
except Exception:
dropped.append(name)
return None
total = len(universe)
if int(max_workers) <= 1:
it = tqdm(universe, desc="Scanning", unit="c") if _HAS_TQDM else universe
for n in it:
row = _one(n)
if row: rows.append(row)
else:
with ThreadPoolExecutor(max_workers=int(max_workers)) as ex:
futures = {ex.submit(_one, n): n for n in universe}
if _HAS_TQDM:
pbar = tqdm(total=total, desc="Scanning", unit="c")
for fut in as_completed(futures):
row = fut.result()
if row: rows.append(row)
pbar.update(1)
pbar.close()
else:
done = 0
for fut in as_completed(futures):
row = fut.result()
if row: rows.append(row)
done += 1
if done == 1 or done % 25 == 0 or done == total:
print(f"[{done}/{total}]")
if not rows:
        return pd.DataFrame(columns=["Metric","Value","As of","Rank"], index=pd.Index([], name="Contract"))
df = pd.DataFrame(rows).set_index("Contract")
# optional winsorization on Effective Value (cross-sectional stability)
if winsor_lo is not None or winsor_hi is not None:
lo = df["Value_eff"].quantile(0.01 if winsor_lo is None else winsor_lo)
hi = df["Value_eff"].quantile(0.99 if winsor_hi is None else winsor_hi)
df["Value_eff_w"] = df["Value_eff"].clip(lower=lo, upper=hi)
else:
df["Value_eff_w"] = df["Value_eff"]
# Cross-sectional stats
# Rank: 1 is best if rank_desc=True
df = df.sort_values("Value_eff_w", ascending=not rank_desc)
df["Rank"] = df["Value_eff_w"].rank(ascending=not rank_desc, method="min")
# Percentile & CS-Z
# Use Value_eff_w as the population for CS stats
vals = df["Value_eff_w"]
df["CS_Percentile"] = percentile_rank(vals if rank_desc else -vals) # top gets
100
mu, sd = float(vals.mean()), float(vals.std(ddof=0) or 1.0)
df["CS_Z"] = (vals - mu) / sd
df["CS_Bucket"] = [decile_bucket(p) for p in df["CS_Percentile"]]
# keep classic columns + new ones (preserve your original columns)
out = df[[
"Metric",
"Value_raw", # original sign before flip
"Value_eff", # after sign flip (used)
"Value_eff_w", # winsorized effective (ranked)
"As of",
"Rank",
"CS_Percentile",
"CS_Z",
"CS_Bucket",
"Self_Strength",
"Self_Z",
    ]].rename(columns={"Value_raw": "Value", "Value_eff": "Value_forRank", "Value_eff_w": "Value_ranked"})
out.index.name = "Contract"
    # Attach diagnostics via DataFrame.attrs (an ad-hoc `_diagnostics` attribute
    # triggers a pandas UserWarning and is easily lost on copies).
    out.attrs["diagnostics"] = {
        "dropped_count": len(dropped),
        "dropped_list": dropped,
        "mu": mu,
        "sd": sd,
    }
return out
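# Usage sketch (assumes picked_label was discovered via enumerate_cot_columns):
#   ranks = compile_leaderboard_dynamic(
#       report_code="legacy_fut", universe=universe,
#       picked_label="Speculators — Net %OI",
#       transform_code="z", transform_win=156, delta_N=None,
#       cs_agg_code="last", cs_N=1, rank_desc=True, need_weeks=156,
#   )
#   ranks.head(10)   # the ten most positively stretched contracts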
def gather_top_series(
*,
report_code: str, universe_ranked: pd.Index, picked_label: str,
transform_code: str, transform_win: int | None, delta_N: int | None,
need_weeks: int, refresh_policy: str, top_n: int = 20
) -> pd.DataFrame:
keep = list(universe_ranked[:max(0, int(top_n))])
frames = []
it = tqdm(keep, desc="Collecting series", unit="c") if _HAS_TQDM else keep
for name in it:
try:
df = fetch_cot_df(report_code, name, need_weeks=need_weeks,
policy=refresh_policy)
colmap = enumerate_cot_columns(df, report_code)
if picked_label not in colmap:
continue
s = colmap[picked_label].astype(float).dropna()
if s.empty: continue
sT = apply_transform(s, transform_code, transform_win,
delta_N).dropna()
if sT.empty: continue
tmp = sT.reset_index(); tmp.columns = ["Date", "Value"]
tmp["Contract"] = name; tmp["Metric"] = picked_label
frames.append(tmp[["Date","Contract","Metric","Value"]])
except Exception as e:
warnings.warn(f"series {name}: {e}")
if not frames:
return pd.DataFrame(columns=["Date","Contract","Metric","Value"])
    return pd.concat(frames, axis=0).sort_values(["Contract","Date"]).reset_index(drop=True)
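# The long format pivots naturally for charting, e.g.:
#   wide = series.pivot(index="Date", columns="Contract", values="Value")
#   wide.plot()   # requires matplotlib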
# ---------------------- TEXT WIZARD ----------------------
def run_wizard():
print("\n=== CFTC COT Leaderboard — Text Wizard ===")
# 1) Report
report_options = [
("Legacy (Futures only)", "legacy_fut"),
("Disaggregated (Fut+Opt)", "disaggregated_futopt"),
("TFF (Financials, Futures)", "traders_in_financial_futures_fut"),
]
i_rep = ask_choice("Choose CFTC report:", [x[0] for x in report_options],
default_idx=2)
report_code = report_options[i_rep][1]
# 2) Universe filter
print(f"\nFetching contract list for: {report_code}")
universe = list_contracts(report_code)
print(f"Total contracts: {len(universe)}")
filt = ask_str("Filter substring for contracts (case-insensitive)", default="")
if filt:
q = filt.lower().strip()
universe = [c for c in universe if q in c.lower()]
print(f"Universe size after filter: {len(universe)}")
if len(universe) == 0:
print("No contracts match your filter. Exiting.")
return
# 3) Prototype load just once to discover ALL columns for this report
print("Sampling first contract to enumerate columns...")
proto_name = universe[0]
    proto_df = fetch_cot_df(report_code, proto_name, need_weeks=520, policy="check")  # ~10 years, so all column patterns appear
colmap = enumerate_cot_columns(proto_df, report_code)
col_labels = list(colmap.keys())
# 4) Let the user choose a column (dynamic!)
i_col = ask_choice("Pick a column/metric to derive and rank:", col_labels,
default_idx=0)
picked_label = col_labels[i_col]
print(f"→ You chose: {picked_label}")
# 5) Transform (pre-CS)
tf_labels = list(TRANSFORMS.keys())
i_tf = ask_choice("Choose a transform to apply before ranking:", tf_labels,
default_idx=0)
tf_code, tf_defwin = TRANSFORMS[tf_labels[i_tf]]
tf_win = None; dN = None
if tf_code == "z":
tf_win = ask_int("Window (weeks) for Z-score", default=int(tf_defwin or 156),
min_v=26, max_v=520)
elif tf_code == "idx":
tf_win = ask_int("Window (weeks) for rolling Index", default=int(tf_defwin or
156), min_v=26, max_v=520)
elif tf_code == "dN":
dN = ask_int("Δ over N weeks — choose N", default=4, min_v=1,
max_v=104)
# 6) Cross-sectional aggregation window & operator
cs_labels = list(CS_AGGS.keys())
i_cs = ask_choice("Choose cross-sectional aggregator over the last N weeks:",
cs_labels, default_idx=0)
cs_code, cs_defN = CS_AGGS[cs_labels[i_cs]]
    cs_N = cs_defN if cs_code == "last" else ask_int("N (weeks) for cross-sectional aggregator", default=cs_defN, min_v=1, max_v=260)
# 7) Rank direction
rank_desc = ask_yesno("Rank with higher values on top?", default_yes=True)
# 7b) Flip sign before ranking? (for metrics where lower is 'better')
    flip_sign = ask_yesno("Flip sign BEFORE ranking? (choose 'yes' when more negative = better)", default_yes=False)
# 7c) Winsorization (optional)
    do_winsor = ask_yesno("Winsorize cross-sectional values before ranking? (stabilizes outliers)", default_yes=False)
winsor_lo = winsor_hi = None
if do_winsor:
winsor_lo = ask_float("Lower quantile (e.g., 0.02 for 2%)", default=0.02,
min_v=0.0, max_v=0.49)
winsor_hi = ask_float("Upper quantile (e.g., 0.98 for 98%)", default=0.98,
min_v=0.51, max_v=1.0)
# 8) Self windows (relative-to-itself metrics)
self_idx_win = ask_int("Window (weeks) for Self Strength (range index 0..100)",
default=156, min_v=26, max_v=520)
self_z_win = ask_int("Window (weeks) for Self Z-score", default=156,
min_v=26, max_v=520)
# 9) Download/compute window needed (keep minimal)
need_weeks = max(
8,
(int(tf_win) if tf_code in {"z","idx"} and tf_win else 0),
(int(dN) if dN else 0),
int(cs_N),
int(self_idx_win),
int(self_z_win),
)
print(f"\nHistory window requested: ~{need_weeks} weeks (auto-trimmed per
contract).")
# 10) Cache policy & workers
rp = ask_choice("Refresh policy for disk cache", ["reuse (fastest)","check
(default)","clear (re-download)"], default_idx=1)
refresh_policy = ["reuse","check","clear"][rp]
max_workers = ask_int("Max workers (1 safest; 2–4 ok on stable setup)",
default=1, min_v=1, max_v=8)
# 11) Excel + series options
excel_out = ask_str("Excel output filename",
default="cot_universe_leaderboard.xlsx")
include_series = ask_yesno("Also export Top-N time series sheet?",
default_yes=False)
top_n_series = 20
if include_series:
top_n_series = ask_int("Top-N contracts to include as time series",
default=20, min_v=1, max_v=200)
# 12) (Optional) limit universe for a quick dry-run
if ask_yesno("Limit universe count for a quick run?", default_yes=False):
cap = ask_int("Keep first K contracts", default=25, min_v=1,
max_v=len(universe))
universe = universe[:cap]
# 13) Optional Excel formatting
    apply_xlsx_fmt = ask_yesno("Apply Excel formatting (freeze panes, widths, heatmaps)?", default_yes=True)
# Persist choices for reproducibility
_save_choices(report_code, {
"report_code": report_code,
"filter": filt or "",
"picked_label": picked_label,
"transform": {"code": tf_code, "win": tf_win, "deltaN": dN},
"cs": {"code": cs_code, "N": cs_N, "rank_desc": rank_desc},
"flip_sign": flip_sign,
"winsor": {"lo": winsor_lo, "hi": winsor_hi} if do_winsor else None,
"self_windows": {"strength": self_idx_win, "z": self_z_win},
"need_weeks": need_weeks,
"refresh_policy": refresh_policy,
"max_workers": max_workers,
"excel_out": excel_out,
"include_series": include_series,
"top_n_series": top_n_series,
"timestamp": pd.Timestamp.now().isoformat(),
})
# ---------------- RUN ----------------
    print(time.strftime("%d-%b-%y %H:%M:%S"), f"- Building leaderboard for report={report_code}")
start = time.time()
ranks = compile_leaderboard_dynamic(
report_code=report_code,
universe=universe,
picked_label=picked_label,
transform_code=tf_code,
transform_win=tf_win,
delta_N=dN,
cs_agg_code=cs_code,
cs_N=cs_N,
rank_desc=rank_desc,
need_weeks=need_weeks,
refresh_policy=refresh_policy,
max_workers=max_workers,
flip_sign=flip_sign,
winsor_lo=winsor_lo, winsor_hi=winsor_hi,
self_z_win=self_z_win, self_idx_win=self_idx_win,
)
took = time.time() - start
print(f"Leaderboard built in {took:.1f}s | Ranked rows: {len(ranks)}")
# Series (optional)
series = pd.DataFrame(columns=["Date","Contract","Metric","Value"])
if include_series and len(ranks) > 0:
print("Collecting Top-N series…")
series = gather_top_series(
report_code=report_code, universe_ranked=ranks.index,
picked_label=picked_label,
transform_code=tf_code, transform_win=tf_win, delta_N=dN,
need_weeks=max(need_weeks, self_idx_win, self_z_win),
refresh_policy=refresh_policy, top_n=top_n_series
)
print(f"Series rows: {len(series)}")
# Params sheet
params = OrderedDict([
("Report", report_code),
("Universe count", len(universe)),
("Picked column", picked_label),
("Transform code", tf_code),
("Transform window", int(tf_win) if tf_win else 0),
("Delta N (for Δ)", int(dN) if dN else 0),
("CS agg", cs_code),
("CS N", int(cs_N)),
("Rank descending?", bool(rank_desc)),
("Flip sign (pre-rank)", bool(flip_sign)),
("Winsor lo", winsor_lo if winsor_lo is not None else ""),
("Winsor hi", winsor_hi if winsor_hi is not None else ""),
("Self Strength win", int(self_idx_win)),
("Self Z win", int(self_z_win)),
("Need weeks (history)", int(need_weeks)),
("Refresh policy", refresh_policy),
("Max workers", int(max_workers)),
("Include series", bool(include_series)),
("TopN series", int(top_n_series if include_series else 0)),
("Generated at", pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")),
])
# Diagnostics sheet info
    di = ranks.attrs.get("diagnostics", {"dropped_count": 0, "dropped_list": [], "mu": np.nan, "sd": np.nan})
diag_rows = [
("Dropped (no data / errors)", di.get("dropped_count", 0)),
("Population mean (ranked value)", di.get("mu", np.nan)),
("Population std (ranked value)", di.get("sd", np.nan)),
]
diag_df = pd.DataFrame(diag_rows, columns=["Key","Value"])
if di.get("dropped_list"):
diag_drop = pd.DataFrame({"Dropped Contracts": di["dropped_list"]})
else:
diag_drop = pd.DataFrame({"Dropped Contracts": []})
# Save Excel (with optional formatting)
engine_kwargs = {}
try:
import xlsxwriter # noqa
engine_kwargs["engine"] = "xlsxwriter"
except Exception:
pass
with pd.ExcelWriter(excel_out, **engine_kwargs) as writer:
# Rankings
ranks_for_excel = ranks.copy()
# Pretty rounding for readability
        for col in ["Value", "Value_forRank", "Value_ranked", "CS_Percentile", "CS_Z", "Self_Strength", "Self_Z"]:
if col in ranks_for_excel.columns:
ranks_for_excel[col] = pd.to_numeric(ranks_for_excel[col],
errors="coerce").round(4)
ranks_for_excel.to_excel(writer, sheet_name="Rankings")
# TopN series
if include_series and not series.empty:
series.to_excel(writer, sheet_name="TopN_Series", index=False)
# Params, Universe, Columns, Diagnostics
pd.DataFrame.from_dict(params, orient="index",
columns=["Value"]).to_excel(writer, sheet_name="Params")
pd.DataFrame({"Contract": universe}).to_excel(writer,
sheet_name="Universe", index=False)
pd.DataFrame({"Available Columns (prototype)":
list(colmap.keys())}).to_excel(writer, sheet_name="Columns", index=False)
diag_df.to_excel(writer, sheet_name="Diagnostics", index=False,
startrow=0)
if not diag_drop.empty:
diag_drop.to_excel(writer, sheet_name="Diagnostics", index=False,
startrow=len(diag_df)+2)
# Optional formatting
if engine_kwargs.get("engine") == "xlsxwriter" and apply_xlsx_fmt:
wb = writer.book
# Formats
fmt_pct = wb.add_format({"num_format": "0.0"})
fmt_num = wb.add_format({"num_format": "0.0000"})
fmt_rank = wb.add_format({"num_format": "0"})
# Rankings worksheet formatting
ws = writer.sheets["Rankings"]
ws.freeze_panes(1, 1)
# set widths
widths = {
"A": 28, # Contract (index col becomes A)
"B": 24, # Metric
"C": 14, "D": 14, "E": 14, # Value cols
"F": 18, # As of
"G": 10, # Rank
"H": 14, # CS_Percentile
"I": 12, # CS_Z
"J": 10, # CS_Bucket
"K": 16, # Self_Strength
"L": 12, # Self_Z
}
for col, w in widths.items():
try:
ws.set_column(f"{col}:{col}", w)
except Exception:
pass
# number formats
# Value/Value_forRank/Value_ranked
for col_letter in ["C","D","E","I","L"]:
try:
ws.set_column(f"{col_letter}:{col_letter}", None, fmt_num)
except Exception:
pass
try:
ws.set_column("H:H", None, fmt_pct) # CS_Percentile
ws.set_column("G:G", None, fmt_rank) # Rank
except Exception:
pass
# Heatmap-ish conditional formatting on percentile (higher=better)
last_row = len(ranks_for_excel) + 1
try:
                ws.conditional_format(1, 7, last_row, 7, {"type": "3_color_scale"})    # CS_Percentile (col H)
                ws.conditional_format(1, 10, last_row, 10, {"type": "3_color_scale"})  # Self_Strength (col K)
except Exception:
pass
print(f"\nSaved Excel → {excel_out}")
print("Done ✔")
# Auto-run the wizard when executed as a script or in a notebook cell.
run_wizard()