# === CFTC COT → Excel Leaderboard (TEXT WIZARD, dynamic columns, disk cache) ===
# - Text prompts only (works in notebook/terminal).
# - Discovers all available columns dynamically from the chosen report.
# - Lets you pick ANY column, transform it, and rank cross-sectionally using last week or last N weeks.
# - Minimal downloads + persistent on-disk cache (survives kernel restarts).
# - Outputs clean Excel with Rankings + Params (+ optional TopN series).
# ------------------------------------------------------------------------------
# 0) Future-import first (it must precede all other statements), then cap BLAS
#    threads (helps stability) BEFORE importing numpy/pandas.
from __future__ import annotations

import os
for k in ["OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"]:
    os.environ.setdefault(k, "1")
import re, warnings, time, json, pathlib
from typing import List
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
# 1) Optional progress bar
try:
from tqdm import tqdm
_HAS_TQDM = True
except Exception:
_HAS_TQDM = False
# 2) Core libs
import numpy as np
import pandas as pd
# 3) Optional HTTP cache (speeds up repeat CFTC fetches this session)
try:
import requests_cache
requests_cache.install_cache("cot_http_cache", expire_after=3600)
except Exception:
pass
# 4) pycot import (supports both spellings)
try:
from pycot.reports import CommitmentOfTraders as COT
except Exception:
from pycot.reports import CommitmentsOfTraders as COT
# Quiet down noisy dateutil warning from pycot
warnings.filterwarnings(
    "ignore",
    message="Could not infer format, so each element will be parsed individually, falling back to `dateutil`.",
    module="pycot.reports",
)
# ---------------------- Disk cache ----------------------
CACHE_DIR = pathlib.Path("./cot_cache_v2")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
def _safe_name(txt: str) -> str:
return re.sub(r"[^A-Za-z0-9_.-]+", "_", txt.strip())
def _df_cache_path(report_code: str, contract: str) -> pathlib.Path:
    return CACHE_DIR / f"{_safe_name(report_code)}__{_safe_name(contract)}.parquet"
def _meta_path(report_code: str) -> pathlib.Path:
return CACHE_DIR / f"{_safe_name(report_code)}__meta.json"
def _choices_path(report_code: str) -> pathlib.Path:
return CACHE_DIR / f"{_safe_name(report_code)}__last_choices.json"
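# Cache paths come from sanitized names; e.g. (illustrative contract name):
#   _df_cache_path("legacy_fut", "WHEAT-SRW - CHICAGO BOARD OF TRADE")
#   → cot_cache_v2/legacy_fut__WHEAT-SRW_-_CHICAGO_BOARD_OF_TRADE.parquet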
def _load_cached_df(report_code: str, contract: str) -> pd.DataFrame | None:
p = _df_cache_path(report_code, contract)
if not p.exists(): return None
try:
df = pd.read_parquet(p)
df.index = pd.to_datetime(df.index).tz_localize(None)
return df
except Exception:
# fallback: maybe it was saved as pickle earlier
try:
df = pd.read_pickle(p.with_suffix(".pkl"))
df.index = pd.to_datetime(df.index).tz_localize(None)
return df
except Exception:
return None
def _save_cached_df(report_code: str, contract: str, df: pd.DataFrame):
p = _df_cache_path(report_code, contract)
try:
df.to_parquet(p)
except Exception:
df.to_pickle(p.with_suffix(".pkl"))
def _load_meta(report_code: str) -> dict:
mp = _meta_path(report_code)
if mp.exists():
try:
return json.loads(mp.read_text())
except Exception:
return {}
return {}
def _save_meta(report_code: str, meta: dict):
_meta_path(report_code).write_text(json.dumps(meta, indent=2, default=str))
def _save_choices(report_code: str, choices: dict):
    _choices_path(report_code).write_text(json.dumps(choices, indent=2, default=str))
# ---------------------- Helpers ----------------------
def _norm(txt: str) -> str:
return re.sub(r"\s+", " ", str(txt).strip().lower())
def _find_col(df, must_include: List[str], must_exclude: List[str] | None = None):
must_exclude = must_exclude or []
for c in df.columns:
lc = _norm(c)
if all(s in lc for s in must_include) and not any(x in lc for x in must_exclude):
return c
return None
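# Matching is by lower-cased substring, so (with a hypothetical header)
#   _find_col(df, ["net % of oi", "managed", "money"])
# would return a column named e.g. "Net % of OI - Managed Money" if present.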
def _get_OI(df):
for key in ["open interest","openinterest","oi"]:
c = _find_col(df, [key])
if c: return df[c].astype(float)
if "OI" in df.columns: return df["OI"].astype(float)
raise KeyError("Open Interest column not found.")
def _match_any_synonym(df, keys_any: List[str]):
packs = [
["commercial"],
["producer","merchant","processor","user"], ["prod","merch","proc","user"],
["noncommercial"], ["large","spec"], ["managed","money"],
["dealer"], ["asset","manager"], ["leveraged","funds"], ["swap","dealer"],
["other","reportable"],
]
# Prefer Net %OI if present
for pack in packs:
if not any(k in pack for k in keys_any): continue
pct = _find_col(df, ["net % of oi", *pack])
if pct: return {"net_pct": df[pct].astype(float), "tokens": pack}
# Fallback: long/short pairs
for pack in packs:
if not any(k in pack for k in keys_any): continue
Lc = _find_col(df, ["long", *pack], must_exclude=["%"])
Sc = _find_col(df, ["short",*pack], must_exclude=["%"])
if Lc and Sc:
return {"long": df[Lc].astype(float), "short": df[Sc].astype(float), "tokens":
pack}
return None
def _get_net_pct(df, keys_any: List[str]):
hit = _match_any_synonym(df, keys_any)
if not hit: return None
if "net_pct" in hit: return hit["net_pct"]
OI = _get_OI(df); eps = 1e-9
return 100.0 * ((hit["long"] - hit["short"]) / (OI + eps))
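# Fallback formula when no ready-made "Net % of OI" column exists:
#   net %OI = 100 * (Long - Short) / (OI + eps)
# e.g. Long=120k, Short=80k, OI=400k → 100 * 40k / 400k = 10.0 (%).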
# Rich column enumeration (label → Series)
_COL_REGEX = re.compile(
    r"(?i)^(?P<kind>net % of oi|net(?:\s*change)?|longs?|shorts?)"
    r"(?:\s*[-,:]?\s*)?(?P<group>.+)$"
)
def _norm_group(raw: str, report_code: str) -> str:
g = _norm(raw)
if "producer/merchant/processor/user" in g or all(k in g for k in
["prod","merch","proc","user"]):
return "Commercial (PM/P/U)"
if "managed money" in g: return "Managed Money"
if "swap dealer" in g: return "Swap Dealers"
if "other report" in g: return "Other Reportables"
if "noncommercial" in g or "large spec" in g: return "Speculators"
if "dealer" in g: return "Dealer"
if "asset" in g and "manager" in g: return "Asset Manager"
if "leveraged" in g and "fund" in g: return "Leveraged Funds"
if "non" in g and "reportable" in g: return "Non-reportable"
if "commercial" in g: return "Commercial"
return raw.strip()
def enumerate_cot_columns(df: pd.DataFrame, report_code: str) -> "OrderedDict[str, pd.Series]":
out = OrderedDict()
# Open interest
try:
out["Open Interest"] = _get_OI(df)
except Exception:
pass
# Patterned: Net %OI / Net / Net Change / Long / Short
for col in df.columns:
cstr = str(col).strip()
m = _COL_REGEX.match(cstr)
if not m: continue
kind = m.group("kind").lower()
group_lbl = _norm_group(m.group("group"), report_code)
s = df[col].astype(float)
if "net % of oi" in kind:
label = f"{group_lbl} — Net %OI"
elif "net change" in kind:
label = f"{group_lbl} — Net Change"
elif kind.startswith("net"):
label = f"{group_lbl} — Net (contracts)"
elif kind.startswith("long"):
label = f"{group_lbl} — Long"
else:
label = f"{group_lbl} — Short"
out[label] = s
# Synthesize common %OI if missing
if "Commercial — Net %OI" not in out:
s = _get_net_pct(df, ["commercial","producer","prod"])
if s is not None: out["Commercial — Net %OI"] = s
if "Speculators — Net %OI" not in out:
s = _get_net_pct(df, ["noncommercial","large spec","managed
money","leveraged","funds","asset","manager"])
if s is not None: out["Speculators — Net %OI"] = s
# Any remaining numeric columns (to be safe)
for col in df.columns:
name = str(col)
if name.lower() == "contract name": continue
if name in out: continue
s = df[col]
if pd.api.types.is_numeric_dtype(s):
out[name] = s.astype(float)
return out
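# Sketch of the labels this yields (actual labels depend on the report's headers):
#   >>> colmap = enumerate_cot_columns(proto_df, "disaggregated_futopt")
#   >>> list(colmap)[:3]
#   ['Open Interest', 'Managed Money — Net %OI', 'Commercial (PM/P/U) — Long']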
# ---------------------- Transforms & CS aggregations ----------------------
def _zscore(x: pd.Series, win: int) -> pd.Series:
m = x.rolling(win).mean(); s = x.rolling(win).std(ddof=0)
return (x - m) / (s.replace(0, np.nan) + 1e-9)
def _range_index(x: pd.Series, win: int) -> pd.Series:
mn = x.rolling(win).min(); mx = x.rolling(win).max()
return 100.0 * (x - mn) / ((mx - mn).replace(0, np.nan) + 1e-9)
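# Toy check (not executed): with s = pd.Series([1., 2., 3., 4., 5.]) and win=4,
#   _zscore(s, 4).iloc[-1]      ≈ 1.34   (last point vs its 4-week mean/std, ddof=0)
#   _range_index(s, 4).iloc[-1] ≈ 100.0  (sitting at the top of its 4-week range)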
TRANSFORMS = OrderedDict([
("Level", ("lvl", None)), # (code, default window)
("Index (156w)", ("idx", 156)),
("Z-score", ("z", 156)), # window chosen interactively
("Δ over N", ("dN", None)), # N chosen interactively
])
CS_AGGS = OrderedDict([
("Latest (last 1)", ("last", 1)),
("Average of last N", ("avgN", 4)),
("Sum of last N", ("sumN", 4)),
("Median of last N", ("medN", 4)),
("Change over last N", ("chgN", 4)),
("Min over last N", ("minN", 4)),
("Max over last N", ("maxN", 4)),
])
def apply_transform(s: pd.Series, code: str, win: int | None, N_for_delta: int | None) -> pd.Series:
code = code.lower()
if code == "lvl":
return s
if code == "idx":
w = int(win or 156)
mn=s.rolling(w).min(); mx=s.rolling(w).max()
return 100.0 * (s - mn) / ((mx - mn).replace(0, np.nan) + 1e-9)
if code == "z":
w = int(win or 156)
return _zscore(s, w)
if code == "dN":
k = int(N_for_delta or 4)
return s - s.shift(k)
return s
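# Wizard choice → call mapping, e.g. "Δ over N" with N=4 is
#   apply_transform(s, "dN", None, 4)    # equivalent to s - s.shift(4)
# and "Z-score" with a 156-week window is apply_transform(s, "z", 156, None).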
def cs_aggregate(s: pd.Series, agg_code: str, N: int) -> float | None:
ss = s.dropna()
if ss.empty: return None
if agg_code == "last":
return float(ss.iloc[-1])
tail = ss.tail(N)
if tail.empty: return None
if agg_code == "avgN": return float(tail.mean())
if agg_code == "sumN": return float(tail.sum())
if agg_code == "medN": return float(tail.median())
if agg_code == "chgN": return float(tail.iloc[-1] - tail.iloc[0])
if agg_code == "minN": return float(tail.min())
if agg_code == "maxN": return float(tail.max())
return float(tail.iloc[-1])
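# Toy check (not executed): with s = pd.Series([1., 2., 3., 4., 5.]) and N=4,
#   cs_aggregate(s, "avgN", 4) → 3.5   (mean of the last four points)
#   cs_aggregate(s, "chgN", 4) → 3.0   (5 - 2, change across the tail window)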
# ---------------------- Extra utilities (new) ----------------------
def winsorize_scalar(x: float, lo: float, hi: float) -> float:
return max(lo, min(hi, x))
def percentile_rank(values: pd.Series) -> pd.Series:
# 0..100 inclusive
n = len(values)
if n <= 1:
return pd.Series([100.0]*n, index=values.index, dtype=float)
ranks = values.rank(method="min", ascending=True) # small→low
return 100.0 * (ranks - 1) / (n - 1)
def decile_bucket(pct: float) -> str:
    if pct is None or np.isnan(pct): return ""
    d = min(9, int(np.floor(pct / 10.0)))  # clamp: pct == 100 belongs in D10, not a phantom D11
    return f"D{d+1}"
# ---------------------- Data access (cached + trimmed) ----------------------
# Refresh policy for the disk cache:
#   "reuse" → use the cached frame if present (fastest).
#   "check" → currently also serves from the cache; the pycot layer plus
#             requests_cache keeps any re-fetch traffic light.
#   "clear" → ignore any cache and re-download.
REFRESH_POLICIES = {"reuse","check","clear"}
def list_contracts(report_type: str) -> List[str]:
raw = COT(report_type).list_available_contracts()
return [str(x) for x in list(raw)]
def _latest_cftc_date(df: pd.DataFrame) -> pd.Timestamp | None:
if df is None or df.empty: return None
return pd.to_datetime(df.index.max()).tz_localize(None)
def _trim_weeks(df: pd.DataFrame, need_weeks: int) -> pd.DataFrame:
    if df is None or df.empty: return df
    cutoff = pd.Timestamp.today().tz_localize(None) - pd.DateOffset(weeks=int(need_weeks))
    return df.loc[df.index >= cutoff]
def fetch_cot_df(report_type: str, contract: str, *, need_weeks: int, policy: str = "check") -> pd.DataFrame:
policy = (policy or "check").lower()
cached = _load_cached_df(report_type, contract)
if policy == "reuse" and cached is not None:
return _trim_weeks(cached, need_weeks)
if policy == "check" and cached is not None:
# We'll still use cached; pycot layer + requests_cache will avoid heavy traffic
anyway.
return _trim_weeks(cached, need_weeks)
# "clear" OR no cache → fetch
cot = COT(report_type)
df = cot.report((contract,)).sort_index()
df.index = pd.to_datetime(df.index).tz_localize(None)
_save_cached_df(report_type, contract, df)
return _trim_weeks(df, need_weeks)
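# Usage sketch (the contract name is hypothetical; take real names from list_contracts()):
#   df = fetch_cot_df("legacy_fut", "GOLD - COMMODITY EXCHANGE INC.",
#                     need_weeks=156, policy="reuse")
#   df.index.max()   # most recent report date held for that contract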
# ---------------------- Wizard I/O ----------------------
def ask_choice(prompt: str, options: List[str], default_idx: int = 0) -> int:
while True:
print(f"\n{prompt}")
for i, opt in enumerate(options, 1):
dmark = " (default)" if (i-1) == default_idx else ""
print(f" {i}. {opt}{dmark}")
raw = input(f"Enter 1-{len(options)} (Enter={default_idx+1}): ").strip()
if raw == "":
return default_idx
if raw.isdigit():
k = int(raw) - 1
if 0 <= k < len(options):
return k
print("Invalid choice, try again.")
def ask_int(prompt: str, default: int, min_v: int = 1, max_v: int | None = None) -> int:
while True:
raw = input(f"{prompt} (default {default}): ").strip()
if raw == "":
return default
try:
v = int(raw)
if v < min_v:
print(f"Must be ≥ {min_v}")
continue
if max_v is not None and v > max_v:
print(f"Must be ≤ {max_v}")
continue
return v
except Exception:
print("Enter an integer.")
def ask_float(prompt: str, default: float, min_v: float = -1e9, max_v: float = 1e9) -> float:
while True:
raw = input(f"{prompt} (default {default}): ").strip()
if raw == "":
return default
try:
v = float(raw)
if v < min_v or v > max_v:
print(f"Must be between {min_v} and {max_v}")
continue
return v
except Exception:
print("Enter a number.")
def ask_str(prompt: str, default: str = "") -> str:
raw = input(f"{prompt} (Enter for '{default}'): ").strip()
return raw if raw != "" else default
def ask_yesno(prompt: str, default_yes: bool = True) -> bool:
d = "Y/n" if default_yes else "y/N"
while True:
raw = input(f"{prompt} ({d}): ").strip().lower()
if raw == "" : return default_yes
if raw in ["y","yes"]: return True
if raw in ["n","no"]: return False
print("Please answer y/n.")
# ---------------------- Leaderboard compute ----------------------
def compile_leaderboard_dynamic(
*,
report_code: str,
universe: List[str],
picked_label: str,
transform_code: str,
transform_win: int | None,
delta_N: int | None,
cs_agg_code: str,
cs_N: int,
rank_desc: bool,
need_weeks: int,
refresh_policy: str = "check",
max_workers: int = 1,
# new:
flip_sign: bool = False,
winsor_lo: float | None = None,
winsor_hi: float | None = None,
self_z_win: int = 156,
self_idx_win: int = 156,
) -> pd.DataFrame:
rows: List[dict] = []
dropped: List[str] = []
def _one(name: str):
try:
df = fetch_cot_df(report_code, name, need_weeks=need_weeks,
policy=refresh_policy)
colmap = enumerate_cot_columns(df, report_code)
if picked_label not in colmap:
return None
s = colmap[picked_label].astype(float)
sT = apply_transform(s, transform_code, transform_win, delta_N)
sT = sT.dropna()
if sT.empty:
return None
# self metrics on transformed series
            z_self = _zscore(sT, int(self_z_win)).iloc[-1] if len(sT) >= max(8, self_z_win // 4) else np.nan
            idx_self = _range_index(sT, int(self_idx_win)).iloc[-1] if len(sT) >= max(8, self_idx_win // 4) else np.nan
val = cs_aggregate(sT, cs_agg_code, cs_N)
if val is None:
return None
if flip_sign:
val = -float(val)
last_dt = pd.to_datetime(sT.index[-1])
return {
"Contract": name,
"Metric": picked_label,
"Value_raw": float(val if not flip_sign else -val), # original, before
sign flip
"Value_eff": float(val), # effective used for ranking (after flip)
"Self_Z": float(z_self) if z_self is not None else np.nan,
"Self_Strength": float(idx_self) if idx_self is not None else np.nan,
"As of": last_dt,
}
except Exception:
dropped.append(name)
return None
total = len(universe)
if int(max_workers) <= 1:
it = tqdm(universe, desc="Scanning", unit="c") if _HAS_TQDM else universe
for n in it:
row = _one(n)
if row: rows.append(row)
else:
with ThreadPoolExecutor(max_workers=int(max_workers)) as ex:
futures = {ex.submit(_one, n): n for n in universe}
if _HAS_TQDM:
pbar = tqdm(total=total, desc="Scanning", unit="c")
for fut in as_completed(futures):
row = fut.result()
if row: rows.append(row)
pbar.update(1)
pbar.close()
else:
done = 0
for fut in as_completed(futures):
row = fut.result()
if row: rows.append(row)
done += 1
if done == 1 or done % 25 == 0 or done == total:
print(f"[{done}/{total}]")
if not rows:
        return pd.DataFrame(columns=["Metric","Value","As of","Rank"], index=pd.Index([], name="Contract"))
df = pd.DataFrame(rows).set_index("Contract")
# optional winsorization on Effective Value (cross-sectional stability)
if winsor_lo is not None or winsor_hi is not None:
lo = df["Value_eff"].quantile(0.01 if winsor_lo is None else winsor_lo)
hi = df["Value_eff"].quantile(0.99 if winsor_hi is None else winsor_hi)
df["Value_eff_w"] = df["Value_eff"].clip(lower=lo, upper=hi)
else:
df["Value_eff_w"] = df["Value_eff"]
# Cross-sectional stats
# Rank: 1 is best if rank_desc=True
df = df.sort_values("Value_eff_w", ascending=not rank_desc)
df["Rank"] = df["Value_eff_w"].rank(ascending=not rank_desc, method="min")
# Percentile & CS-Z
# Use Value_eff_w as the population for CS stats
vals = df["Value_eff_w"]
df["CS_Percentile"] = percentile_rank(vals if rank_desc else -vals) # top gets
100
mu, sd = float(vals.mean()), float(vals.std(ddof=0) or 1.0)
df["CS_Z"] = (vals - mu) / sd
df["CS_Bucket"] = [decile_bucket(p) for p in df["CS_Percentile"]]
# keep classic columns + new ones (preserve your original columns)
out = df[[
"Metric",
"Value_raw", # original sign before flip
"Value_eff", # after sign flip (used)
"Value_eff_w", # winsorized effective (ranked)
"As of",
"Rank",
"CS_Percentile",
"CS_Z",
"CS_Bucket",
"Self_Strength",
"Self_Z",
    ]].rename(columns={"Value_raw": "Value", "Value_eff": "Value_forRank", "Value_eff_w": "Value_ranked"})
out.index.name = "Contract"
    # Attach diagnostics via DataFrame.attrs (an ad-hoc `_diagnostics` attribute
    # triggers a pandas UserWarning and is easily lost on copies).
    out.attrs["diagnostics"] = {
        "dropped_count": len(dropped),
        "dropped_list": dropped,
        "mu": mu,
        "sd": sd,
    }
return out
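# Usage sketch (assumes picked_label was discovered via enumerate_cot_columns):
#   ranks = compile_leaderboard_dynamic(
#       report_code="legacy_fut", universe=universe,
#       picked_label="Speculators — Net %OI",
#       transform_code="z", transform_win=156, delta_N=None,
#       cs_agg_code="last", cs_N=1, rank_desc=True, need_weeks=156,
#   )
#   ranks.head(10)   # the ten most positively stretched contracts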
def gather_top_series(
*,
report_code: str, universe_ranked: pd.Index, picked_label: str,
transform_code: str, transform_win: int | None, delta_N: int | None,
need_weeks: int, refresh_policy: str, top_n: int = 20
) -> pd.DataFrame:
keep = list(universe_ranked[:max(0, int(top_n))])
frames = []
it = tqdm(keep, desc="Collecting series", unit="c") if _HAS_TQDM else keep
for name in it:
try:
df = fetch_cot_df(report_code, name, need_weeks=need_weeks,
policy=refresh_policy)
colmap = enumerate_cot_columns(df, report_code)
if picked_label not in colmap:
continue
s = colmap[picked_label].astype(float).dropna()
if s.empty: continue
sT = apply_transform(s, transform_code, transform_win,
delta_N).dropna()
if sT.empty: continue
tmp = sT.reset_index(); tmp.columns = ["Date", "Value"]
tmp["Contract"] = name; tmp["Metric"] = picked_label
frames.append(tmp[["Date","Contract","Metric","Value"]])
except Exception as e:
warnings.warn(f"series {name}: {e}")
if not frames:
return pd.DataFrame(columns=["Date","Contract","Metric","Value"])
    return pd.concat(frames, axis=0).sort_values(["Contract","Date"]).reset_index(drop=True)
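# The long format pivots naturally for charting, e.g.:
#   wide = series.pivot(index="Date", columns="Contract", values="Value")
#   wide.plot()   # requires matplotlib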
# ---------------------- TEXT WIZARD ----------------------
def run_wizard():
print("\n=== CFTC COT Leaderboard — Text Wizard ===")
# 1) Report
report_options = [
("Legacy (Futures only)", "legacy_fut"),
("Disaggregated (Fut+Opt)", "disaggregated_futopt"),
("TFF (Financials, Futures)", "traders_in_financial_futures_fut"),
]
i_rep = ask_choice("Choose CFTC report:", [x[0] for x in report_options],
default_idx=2)
report_code = report_options[i_rep][1]
# 2) Universe filter
print(f"\nFetching contract list for: {report_code}")
universe = list_contracts(report_code)
print(f"Total contracts: {len(universe)}")
filt = ask_str("Filter substring for contracts (case-insensitive)", default="")
if filt:
q = filt.lower().strip()
universe = [c for c in universe if q in c.lower()]
print(f"Universe size after filter: {len(universe)}")
if len(universe) == 0:
print("No contracts match your filter. Exiting.")
return
# 3) Prototype load just once to discover ALL columns for this report
print("Sampling first contract to enumerate columns...")
proto_name = universe[0]
    proto_df = fetch_cot_df(report_code, proto_name, need_weeks=520, policy="check")  # ~10 years, so all column patterns appear
colmap = enumerate_cot_columns(proto_df, report_code)
col_labels = list(colmap.keys())
# 4) Let the user choose a column (dynamic!)
i_col = ask_choice("Pick a column/metric to derive and rank:", col_labels,
default_idx=0)
picked_label = col_labels[i_col]
print(f"→ You chose: {picked_label}")
# 5) Transform (pre-CS)
tf_labels = list(TRANSFORMS.keys())
i_tf = ask_choice("Choose a transform to apply before ranking:", tf_labels,
default_idx=0)
tf_code, tf_defwin = TRANSFORMS[tf_labels[i_tf]]
tf_win = None; dN = None
if tf_code == "z":
tf_win = ask_int("Window (weeks) for Z-score", default=int(tf_defwin or 156),
min_v=26, max_v=520)
elif tf_code == "idx":
tf_win = ask_int("Window (weeks) for rolling Index", default=int(tf_defwin or
156), min_v=26, max_v=520)
elif tf_code == "dN":
dN = ask_int("Δ over N weeks — choose N", default=4, min_v=1,
max_v=104)
# 6) Cross-sectional aggregation window & operator
cs_labels = list(CS_AGGS.keys())
i_cs = ask_choice("Choose cross-sectional aggregator over the last N weeks:",
cs_labels, default_idx=0)
cs_code, cs_defN = CS_AGGS[cs_labels[i_cs]]
    cs_N = cs_defN if cs_code == "last" else ask_int("N (weeks) for cross-sectional aggregator", default=cs_defN, min_v=1, max_v=260)
# 7) Rank direction
rank_desc = ask_yesno("Rank with higher values on top?", default_yes=True)
# 7b) Flip sign before ranking? (for metrics where lower is 'better')
    flip_sign = ask_yesno("Flip sign BEFORE ranking? (choose 'yes' when more negative = better)", default_yes=False)
# 7c) Winsorization (optional)
    do_winsor = ask_yesno("Winsorize cross-sectional values before ranking? (stabilizes outliers)", default_yes=False)
winsor_lo = winsor_hi = None
if do_winsor:
winsor_lo = ask_float("Lower quantile (e.g., 0.02 for 2%)", default=0.02,
min_v=0.0, max_v=0.49)
winsor_hi = ask_float("Upper quantile (e.g., 0.98 for 98%)", default=0.98,
min_v=0.51, max_v=1.0)
# 8) Self windows (relative-to-itself metrics)
self_idx_win = ask_int("Window (weeks) for Self Strength (range index 0..100)",
default=156, min_v=26, max_v=520)
self_z_win = ask_int("Window (weeks) for Self Z-score", default=156,
min_v=26, max_v=520)
# 9) Download/compute window needed (keep minimal)
need_weeks = max(
8,
(int(tf_win) if tf_code in {"z","idx"} and tf_win else 0),
(int(dN) if dN else 0),
int(cs_N),
int(self_idx_win),
int(self_z_win),
)
print(f"\nHistory window requested: ~{need_weeks} weeks (auto-trimmed per
contract).")
# 10) Cache policy & workers
rp = ask_choice("Refresh policy for disk cache", ["reuse (fastest)","check
(default)","clear (re-download)"], default_idx=1)
refresh_policy = ["reuse","check","clear"][rp]
max_workers = ask_int("Max workers (1 safest; 2–4 ok on stable setup)",
default=1, min_v=1, max_v=8)
# 11) Excel + series options
excel_out = ask_str("Excel output filename",
default="cot_universe_leaderboard.xlsx")
include_series = ask_yesno("Also export Top-N time series sheet?",
default_yes=False)
top_n_series = 20
if include_series:
top_n_series = ask_int("Top-N contracts to include as time series",
default=20, min_v=1, max_v=200)
# 12) (Optional) limit universe for a quick dry-run
if ask_yesno("Limit universe count for a quick run?", default_yes=False):
cap = ask_int("Keep first K contracts", default=25, min_v=1,
max_v=len(universe))
universe = universe[:cap]
# 13) Optional Excel formatting
    apply_xlsx_fmt = ask_yesno("Apply Excel formatting (freeze panes, widths, heatmaps)?", default_yes=True)
# Persist choices for reproducibility
_save_choices(report_code, {
"report_code": report_code,
"filter": filt or "",
"picked_label": picked_label,
"transform": {"code": tf_code, "win": tf_win, "deltaN": dN},
"cs": {"code": cs_code, "N": cs_N, "rank_desc": rank_desc},
"flip_sign": flip_sign,
"winsor": {"lo": winsor_lo, "hi": winsor_hi} if do_winsor else None,
"self_windows": {"strength": self_idx_win, "z": self_z_win},
"need_weeks": need_weeks,
"refresh_policy": refresh_policy,
"max_workers": max_workers,
"excel_out": excel_out,
"include_series": include_series,
"top_n_series": top_n_series,
"timestamp": pd.Timestamp.now().isoformat(),
})
# ---------------- RUN ----------------
    print(time.strftime("%d-%b-%y %H:%M:%S"), f"- Building leaderboard for report={report_code}")
start = time.time()
ranks = compile_leaderboard_dynamic(
report_code=report_code,
universe=universe,
picked_label=picked_label,
transform_code=tf_code,
transform_win=tf_win,
delta_N=dN,
cs_agg_code=cs_code,
cs_N=cs_N,
rank_desc=rank_desc,
need_weeks=need_weeks,
refresh_policy=refresh_policy,
max_workers=max_workers,
flip_sign=flip_sign,
winsor_lo=winsor_lo, winsor_hi=winsor_hi,
self_z_win=self_z_win, self_idx_win=self_idx_win,
)
took = time.time() - start
print(f"Leaderboard built in {took:.1f}s | Ranked rows: {len(ranks)}")
# Series (optional)
series = pd.DataFrame(columns=["Date","Contract","Metric","Value"])
if include_series and len(ranks) > 0:
print("Collecting Top-N series…")
series = gather_top_series(
report_code=report_code, universe_ranked=ranks.index,
picked_label=picked_label,
transform_code=tf_code, transform_win=tf_win, delta_N=dN,
need_weeks=max(need_weeks, self_idx_win, self_z_win),
refresh_policy=refresh_policy, top_n=top_n_series
)
print(f"Series rows: {len(series)}")
# Params sheet
params = OrderedDict([
("Report", report_code),
("Universe count", len(universe)),
("Picked column", picked_label),
("Transform code", tf_code),
("Transform window", int(tf_win) if tf_win else 0),
("Delta N (for Δ)", int(dN) if dN else 0),
("CS agg", cs_code),
("CS N", int(cs_N)),
("Rank descending?", bool(rank_desc)),
("Flip sign (pre-rank)", bool(flip_sign)),
("Winsor lo", winsor_lo if winsor_lo is not None else ""),
("Winsor hi", winsor_hi if winsor_hi is not None else ""),
("Self Strength win", int(self_idx_win)),
("Self Z win", int(self_z_win)),
("Need weeks (history)", int(need_weeks)),
("Refresh policy", refresh_policy),
("Max workers", int(max_workers)),
("Include series", bool(include_series)),
("TopN series", int(top_n_series if include_series else 0)),
("Generated at", pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")),
])
# Diagnostics sheet info
    di = ranks.attrs.get("diagnostics", {"dropped_count": 0, "dropped_list": [], "mu": np.nan, "sd": np.nan})
diag_rows = [
("Dropped (no data / errors)", di.get("dropped_count", 0)),
("Population mean (ranked value)", di.get("mu", np.nan)),
("Population std (ranked value)", di.get("sd", np.nan)),
]
diag_df = pd.DataFrame(diag_rows, columns=["Key","Value"])
if di.get("dropped_list"):
diag_drop = pd.DataFrame({"Dropped Contracts": di["dropped_list"]})
else:
diag_drop = pd.DataFrame({"Dropped Contracts": []})
# Save Excel (with optional formatting)
engine_kwargs = {}
try:
import xlsxwriter # noqa
engine_kwargs["engine"] = "xlsxwriter"
except Exception:
pass
with pd.ExcelWriter(excel_out, **engine_kwargs) as writer:
# Rankings
ranks_for_excel = ranks.copy()
# Pretty rounding for readability
        for col in ["Value", "Value_forRank", "Value_ranked", "CS_Percentile", "CS_Z", "Self_Strength", "Self_Z"]:
if col in ranks_for_excel.columns:
ranks_for_excel[col] = pd.to_numeric(ranks_for_excel[col],
errors="coerce").round(4)
ranks_for_excel.to_excel(writer, sheet_name="Rankings")
# TopN series
if include_series and not series.empty:
series.to_excel(writer, sheet_name="TopN_Series", index=False)
# Params, Universe, Columns, Diagnostics
pd.DataFrame.from_dict(params, orient="index",
columns=["Value"]).to_excel(writer, sheet_name="Params")
pd.DataFrame({"Contract": universe}).to_excel(writer,
sheet_name="Universe", index=False)
pd.DataFrame({"Available Columns (prototype)":
list(colmap.keys())}).to_excel(writer, sheet_name="Columns", index=False)
diag_df.to_excel(writer, sheet_name="Diagnostics", index=False,
startrow=0)
if not diag_drop.empty:
diag_drop.to_excel(writer, sheet_name="Diagnostics", index=False,
startrow=len(diag_df)+2)
# Optional formatting
if engine_kwargs.get("engine") == "xlsxwriter" and apply_xlsx_fmt:
wb = writer.book
# Formats
fmt_pct = wb.add_format({"num_format": "0.0"})
fmt_num = wb.add_format({"num_format": "0.0000"})
fmt_rank = wb.add_format({"num_format": "0"})
# Rankings worksheet formatting
ws = writer.sheets["Rankings"]
ws.freeze_panes(1, 1)
# set widths
widths = {
"A": 28, # Contract (index col becomes A)
"B": 24, # Metric
"C": 14, "D": 14, "E": 14, # Value cols
"F": 18, # As of
"G": 10, # Rank
"H": 14, # CS_Percentile
"I": 12, # CS_Z
"J": 10, # CS_Bucket
"K": 16, # Self_Strength
"L": 12, # Self_Z
}
for col, w in widths.items():
try:
ws.set_column(f"{col}:{col}", w)
except Exception:
pass
# number formats
# Value/Value_forRank/Value_ranked
for col_letter in ["C","D","E","I","L"]:
try:
ws.set_column(f"{col_letter}:{col_letter}", None, fmt_num)
except Exception:
pass
try:
ws.set_column("H:H", None, fmt_pct) # CS_Percentile
ws.set_column("G:G", None, fmt_rank) # Rank
except Exception:
pass
# Heatmap-ish conditional formatting on percentile (higher=better)
last_row = len(ranks_for_excel) + 1
try:
                ws.conditional_format(1, 7, last_row, 7, {"type": "3_color_scale"})    # CS_Percentile (col H)
                ws.conditional_format(1, 10, last_row, 10, {"type": "3_color_scale"})  # Self_Strength (col K)
except Exception:
pass
print(f"\nSaved Excel → {excel_out}")
print("Done ✔")
# Auto-run the wizard when executed as a script or in a notebook cell.
run_wizard()