eval/empirical_analysis.py · Prediction Market Bot Postmortem

165 lines · python

"""Hermes root-cause analysis: empirical bracket-hit model vs Gaussian.
 
Reads the joined historical dataset, reconstructs (NWS forecast, bracket bounds,
hit/miss, market price), and:
  1. Examines the distance->hit relationship vs what the Gaussian predicts
  2. Builds an empirical P(hit | |forecast - bracket_mid|) lookup (Laplace-smoothed)
  3. Walk-forward backtests empirical vs Gaussian: Brier, WR, EV, total P&L
No external deps (stdlib only).
"""
import csv, math, os, re, statistics
from collections import defaultdict
 
PSV = os.environ.get("HERMES_DATASET", "data/sample_trades.psv")
COLS = ["id","ticker","market_title","nws_forecast","side","edge","ensemble_probability",
        "pnl","opened_at","cost","count","avg_price","nws_probability","market_price",
        "grok_probability","actual_outcome","correct","market_type"]
 
BRACKET_RE = re.compile(r"be\s+\**\s*(-?\d+)\s*-\s*(-?\d+)\s*°")
DEFAULT_MAE_FALLBACK = 3.0
 
def f(x):
    try: return float(x)
    except (TypeError, ValueError): return None
 
def load():
    rows = []
    with open(PSV) as fh:
        for line in fh:
            parts = line.rstrip("\n").split("|")
            if len(parts) != len(COLS):
                continue
            r = dict(zip(COLS, parts))
            m = BRACKET_RE.search(r["market_title"])
            if not m:
                continue
            lo, hi = int(m.group(1)), int(m.group(2))
            if lo > hi: lo, hi = hi, lo
            r["lo"], r["hi"] = lo, hi
            r["mid"] = (lo + hi) / 2.0
            r["width"] = hi - lo
            r["nws"] = f(r["nws_forecast"])
            r["hit"] = 1 if r["actual_outcome"].strip().lower() == "yes" else 0
            r["mkt_price"] = f(r["avg_price"])
            r["mkt_p_hit"] = (1.0 - r["mkt_price"]) if r["mkt_price"] is not None else None
            r["gauss_p_hit"] = f(r["nws_probability"])
            r["pnl"] = f(r["pnl"]); r["cost"] = f(r["cost"]); r["cnt"] = f(r["count"])
            rows.append(r)
    return rows
 
def gaussian_p_hit(nws, lo, hi, mae=DEFAULT_MAE_FALLBACK):
    """Reproduce nws_implied_probability bracket branch."""
    sigma = mae * math.sqrt(math.pi / 2.0)
    z_lo = (lo - nws) / sigma
    z_hi = (hi - nws) / sigma
    cdf = lambda z: 0.5 * (1.0 + math.erf(z / math.sqrt(2)))
    return max(0.0, cdf(z_hi) - cdf(z_lo))
 
def absd_bin(ad):
 
    for edge in (0.5, 1.5, 2.5, 3.5, 5.0, 8.0):
        if ad < edge: return edge
    return 99.0
 
def build_empirical(train):
    """P(hit | |d| bin) with Laplace smoothing toward the global base rate."""
    by_bin = defaultdict(lambda: [0,0])
    g_hits = g_n = 0
    for r in train:
        if r["nws"] is None: continue
        b = absd_bin(abs(r["nws"] - r["mid"]))
        by_bin[b][0] += r["hit"]; by_bin[b][1] += 1
        g_hits += r["hit"]; g_n += 1
    base = (g_hits / g_n) if g_n else 0.15
    ALPHA = 4.0
    table = {}
    for b,(h,n) in by_bin.items():
        table[b] = (h + ALPHA*base) / (n + ALPHA)
    return table, base
 
def emp_p_hit(table, base, nws, mid):
    if nws is None: return base
    return table.get(absd_bin(abs(nws - mid)), base)
 
def brier(pred, outcome):
    return (pred - outcome) ** 2
 
def walk_forward(rows, min_edge):
    """Chronological. For trade i, train empirical on 0..i-1 only.
    Simulate: bet NO if p_hit <= mkt_p_hit - min_edge; YES if p_hit >= mkt_p_hit + min_edge."""
    res = {"gauss": dict(n=0,wins=0,pnl=0.0,brier=[]),
           "emp":   dict(n=0,wins=0,pnl=0.0,brier=[]),
           "actual_all_brier_g":[], "actual_all_brier_e":[]}
    usable = [r for r in rows if r["nws"] is not None and r["mkt_p_hit"] is not None
              and r["pnl"] is not None and r["cnt"]]
    for i, r in enumerate(usable):
        train = usable[:i]
        if len(train) < 15:
            continue
        table, base = build_empirical(train)
        pe = emp_p_hit(table, base, r["nws"], r["mid"])
        pg = gaussian_p_hit(r["nws"], r["lo"], r["hi"])
        mp = r["mkt_p_hit"]
 
        res["actual_all_brier_g"].append(brier(pg, r["hit"]))
        res["actual_all_brier_e"].append(brier(pe, r["hit"]))
        per_contract_no_win = (1.0 - r["mkt_price"])
        for tag, p in (("gauss", pg), ("emp", pe)):
            d = res[tag]
            if p <= mp - min_edge:
                d["n"] += 1
                won = (r["hit"] == 0)
                d["pnl"] += (r["cnt"]*per_contract_no_win) if won else (-r["cost"])
                d["wins"] += 1 if won else 0
                d["brier"].append(brier(p, r["hit"]))
            elif p >= mp + min_edge:
                d["n"] += 1
                won = (r["hit"] == 1)
                yes_price = 1.0 - r["mkt_price"]
                d["pnl"] += (r["cnt"]*(1.0-yes_price)) if won else (-r["cnt"]*yes_price)
                d["wins"] += 1 if won else 0
                d["brier"].append(brier(p, r["hit"]))
    return res, usable
 
def main():
    rows = load()
    print(f"Loaded {len(rows)} bracket rows; "
          f"{sum(1 for r in rows if r['nws'] is not None)} have NWS forecast.\n")
 
    print("=== Empirical P(hit) by |NWS - bracket_mid|, vs Gaussian prediction ===")
    buckets = defaultdict(lambda:[0,0,[]])
    for r in rows:
        if r["nws"] is None: continue
        b = absd_bin(abs(r["nws"]-r["mid"]))
        buckets[b][0]+=r["hit"]; buckets[b][1]+=1
        buckets[b][2].append(gaussian_p_hit(r["nws"],r["lo"],r["hi"]))
    print(f"{'|d|<':>6} {'n':>4} {'hits':>4} {'emp_P(hit)':>10} {'gauss_P(hit)':>12}  gap")
    for b in sorted(buckets):
        h,n,gs = buckets[b]
        emp = h/n if n else 0
        g = statistics.mean(gs) if gs else 0
        print(f"{b:>6} {n:>4} {h:>4} {emp:>10.3f} {g:>12.3f}  {emp-g:+.3f}")
 
    bg = [brier(gaussian_p_hit(r["nws"],r["lo"],r["hi"]), r["hit"])
          for r in rows if r["nws"] is not None]
    print(f"\nGaussian Brier (all {len(bg)} w/ NWS): {statistics.mean(bg):.4f}  "
          f"(0.25=coinflip, lower=better)")
 
    for thr in (0.45, 0.35, 0.25, 0.15):
        res, usable = walk_forward(rows, thr)
        print(f"\n=== WALK-FORWARD @ min_edge={thr}  (usable n={len(usable)}, "
              f"warmup 15) ===")
        for tag in ("gauss","emp"):
            d = res[tag]
            n = d["n"]; wr = (d["wins"]/n*100) if n else 0
            ev = (d["pnl"]/n) if n else 0
            bm = statistics.mean(d["brier"]) if d["brier"] else float("nan")
            print(f"  {tag:5s}: trades={n:3d} WR={wr:5.1f}% "
                  f"P&L=${d['pnl']:+7.2f} EV/trade=${ev:+.3f} Brier(traded)={bm:.4f}")
        if res["actual_all_brier_g"]:
            print(f"  calibration Brier on ALL resolved (walk-fwd): "
                  f"gauss={statistics.mean(res['actual_all_brier_g']):.4f}  "
                  f"emp={statistics.mean(res['actual_all_brier_e']):.4f}")
 
if __name__ == "__main__":
    main()

1	"""Hermes root-cause analysis: empirical bracket-hit model vs Gaussian.
2
3	Reads the joined historical dataset, reconstructs (NWS forecast, bracket bounds,
4	hit/miss, market price), and:
5	1. Examines the distance->hit relationship vs what the Gaussian predicts
6	2. Builds an empirical P(hit \| \|forecast - bracket_mid\|) lookup (Laplace-smoothed)
7	3. Walk-forward backtests empirical vs Gaussian: Brier, WR, EV, total P&L
8	No external deps (stdlib only).
9	"""
10	import csv, math, os, re, statistics
11	from collections import defaultdict
12
13	PSV = os.environ.get("HERMES_DATASET", "data/sample_trades.psv")
14	COLS = ["id","ticker","market_title","nws_forecast","side","edge","ensemble_probability",
15	"pnl","opened_at","cost","count","avg_price","nws_probability","market_price",
16	"grok_probability","actual_outcome","correct","market_type"]
17
18	BRACKET_RE = re.compile(r"be\s+\*\s(-?\d+)\s-\s(-?\d+)\s*°")
19	DEFAULT_MAE_FALLBACK = 3.0
20
21	def f(x):
22	try: return float(x)
23	except (TypeError, ValueError): return None
24
25	def load():
26	rows = []
27	with open(PSV) as fh:
28	for line in fh:
29	parts = line.rstrip("\n").split("\|")
30	if len(parts) != len(COLS):
31	continue
32	r = dict(zip(COLS, parts))
33	m = BRACKET_RE.search(r["market_title"])
34	if not m:
35	continue
36	lo, hi = int(m.group(1)), int(m.group(2))
37	if lo > hi: lo, hi = hi, lo
38	r["lo"], r["hi"] = lo, hi
39	r["mid"] = (lo + hi) / 2.0
40	r["width"] = hi - lo
41	r["nws"] = f(r["nws_forecast"])
42	r["hit"] = 1 if r["actual_outcome"].strip().lower() == "yes" else 0
43	r["mkt_price"] = f(r["avg_price"])
44	r["mkt_p_hit"] = (1.0 - r["mkt_price"]) if r["mkt_price"] is not None else None
45	r["gauss_p_hit"] = f(r["nws_probability"])
46	r["pnl"] = f(r["pnl"]); r["cost"] = f(r["cost"]); r["cnt"] = f(r["count"])
47	rows.append(r)
48	return rows
49
50	def gaussian_p_hit(nws, lo, hi, mae=DEFAULT_MAE_FALLBACK):
51	"""Reproduce nws_implied_probability bracket branch."""
52	sigma = mae * math.sqrt(math.pi / 2.0)
53	z_lo = (lo - nws) / sigma
54	z_hi = (hi - nws) / sigma
55	cdf = lambda z: 0.5 * (1.0 + math.erf(z / math.sqrt(2)))
56	return max(0.0, cdf(z_hi) - cdf(z_lo))
57
58	def absd_bin(ad):
59
60	for edge in (0.5, 1.5, 2.5, 3.5, 5.0, 8.0):
61	if ad < edge: return edge
62	return 99.0
63
64	def build_empirical(train):
65	"""P(hit \| \|d\| bin) with Laplace smoothing toward the global base rate."""
66	by_bin = defaultdict(lambda: [0,0])
67	g_hits = g_n = 0
68	for r in train:
69	if r["nws"] is None: continue
70	b = absd_bin(abs(r["nws"] - r["mid"]))
71	by_bin[b][0] += r["hit"]; by_bin[b][1] += 1
72	g_hits += r["hit"]; g_n += 1
73	base = (g_hits / g_n) if g_n else 0.15
74	ALPHA = 4.0
75	table = {}
76	for b,(h,n) in by_bin.items():
77	table[b] = (h + ALPHA*base) / (n + ALPHA)
78	return table, base
79
80	def emp_p_hit(table, base, nws, mid):
81	if nws is None: return base
82	return table.get(absd_bin(abs(nws - mid)), base)
83
84	def brier(pred, outcome):
85	return (pred - outcome) ** 2
86
87	def walk_forward(rows, min_edge):
88	"""Chronological. For trade i, train empirical on 0..i-1 only.
89	Simulate: bet NO if p_hit <= mkt_p_hit - min_edge; YES if p_hit >= mkt_p_hit + min_edge."""
90	res = {"gauss": dict(n=0,wins=0,pnl=0.0,brier=[]),
91	"emp": dict(n=0,wins=0,pnl=0.0,brier=[]),
92	"actual_all_brier_g":[], "actual_all_brier_e":[]}
93	usable = [r for r in rows if r["nws"] is not None and r["mkt_p_hit"] is not None
94	and r["pnl"] is not None and r["cnt"]]
95	for i, r in enumerate(usable):
96	train = usable[:i]
97	if len(train) < 15:
98	continue
99	table, base = build_empirical(train)
100	pe = emp_p_hit(table, base, r["nws"], r["mid"])
101	pg = gaussian_p_hit(r["nws"], r["lo"], r["hi"])
102	mp = r["mkt_p_hit"]
103
104	res["actual_all_brier_g"].append(brier(pg, r["hit"]))
105	res["actual_all_brier_e"].append(brier(pe, r["hit"]))
106	per_contract_no_win = (1.0 - r["mkt_price"])
107	for tag, p in (("gauss", pg), ("emp", pe)):
108	d = res[tag]
109	if p <= mp - min_edge:
110	d["n"] += 1
111	won = (r["hit"] == 0)
112	d["pnl"] += (r["cnt"]*per_contract_no_win) if won else (-r["cost"])
113	d["wins"] += 1 if won else 0
114	d["brier"].append(brier(p, r["hit"]))
115	elif p >= mp + min_edge:
116	d["n"] += 1
117	won = (r["hit"] == 1)
118	yes_price = 1.0 - r["mkt_price"]
119	d["pnl"] += (r["cnt"](1.0-yes_price)) if won else (-r["cnt"]yes_price)
120	d["wins"] += 1 if won else 0
121	d["brier"].append(brier(p, r["hit"]))
122	return res, usable
123
124	def main():
125	rows = load()
126	print(f"Loaded {len(rows)} bracket rows; "
127	f"{sum(1 for r in rows if r['nws'] is not None)} have NWS forecast.\n")
128
129	print("=== Empirical P(hit) by \|NWS - bracket_mid\|, vs Gaussian prediction ===")
130	buckets = defaultdict(lambda:[0,0,[]])
131	for r in rows:
132	if r["nws"] is None: continue
133	b = absd_bin(abs(r["nws"]-r["mid"]))
134	buckets[b][0]+=r["hit"]; buckets[b][1]+=1
135	buckets[b][2].append(gaussian_p_hit(r["nws"],r["lo"],r["hi"]))
136	print(f"{'\|d\|<':>6} {'n':>4} {'hits':>4} {'emp_P(hit)':>10} {'gauss_P(hit)':>12} gap")
137	for b in sorted(buckets):
138	h,n,gs = buckets[b]
139	emp = h/n if n else 0
140	g = statistics.mean(gs) if gs else 0
141	print(f"{b:>6} {n:>4} {h:>4} {emp:>10.3f} {g:>12.3f} {emp-g:+.3f}")
142
143	bg = [brier(gaussian_p_hit(r["nws"],r["lo"],r["hi"]), r["hit"])
144	for r in rows if r["nws"] is not None]
145	print(f"\nGaussian Brier (all {len(bg)} w/ NWS): {statistics.mean(bg):.4f} "
146	f"(0.25=coinflip, lower=better)")
147
148	for thr in (0.45, 0.35, 0.25, 0.15):
149	res, usable = walk_forward(rows, thr)
150	print(f"\n=== WALK-FORWARD @ min_edge={thr} (usable n={len(usable)}, "
151	f"warmup 15) ===")
152	for tag in ("gauss","emp"):
153	d = res[tag]
154	n = d["n"]; wr = (d["wins"]/n*100) if n else 0
155	ev = (d["pnl"]/n) if n else 0
156	bm = statistics.mean(d["brier"]) if d["brier"] else float("nan")
157	print(f" {tag:5s}: trades={n:3d} WR={wr:5.1f}% "
158	f"P&L=${d['pnl']:+7.2f} EV/trade=${ev:+.3f} Brier(traded)={bm:.4f}")
159	if res["actual_all_brier_g"]:
160	print(f" calibration Brier on ALL resolved (walk-fwd): "
161	f"gauss={statistics.mean(res['actual_all_brier_g']):.4f} "
162	f"emp={statistics.mean(res['actual_all_brier_e']):.4f}")
163
164	if __name__ == "__main__":
165	main()