eval/c4_eval.py · Prediction Market Bot Postmortem

189 lines · python

"""Hermes C4 - shadow-pivot evaluation (unattended).
 
Runs on CT-REDACTED via system cron (no Claude session needed). Pulls SHADOW
predictions from hermes.db, backfills outcomes from the Kalshi API, applies a
hard liquidity filter, and scores the pivot against the pre-committed decision
gate. Writes a report + Discord ping. Idempotent; safe to re-run.
 
Decision gate (from PIVOT_SPEC.md): propose a small live pilot ONLY if, on the
liquid subset with >=30 resolved:
    EV/trade after fees > 0  AND  Brier < 0.25  AND  it holds in the
    highest-volume city/market-type cell (not one lucky cluster).
Otherwise: iterate SHADOW_GAP_K / SHADOW_MIN_BRACKET_WIDTH, or retire.
NEVER enables live trading. Never writes auto_config.
"""
import sys, os, json, sqlite3, statistics, datetime, traceback
 
# Filesystem locations. Python does not expand "$VAR" inside string literals,
# so the paths are run through os.path.expandvars: with HERMES_HOME set in the
# environment (remember that cron starts jobs with a minimal environment) they
# resolve to the real install directory; with it unset the strings are left
# exactly as written, which is the previous behaviour.
DB = os.path.expandvars("$HERMES_HOME/hermes.db")   # SQLite file holding the predictions
REPORT_DIR = os.path.expandvars("$HERMES_HOME")     # where c4_report_<timestamp>.txt is written

# Liquidity filter: a prediction is only scored when its market price and
# volume are both at or above these floors.
MIN_PRICE = 0.10
MIN_VOLUME = 20

# Decision-gate thresholds (see the module docstring).
MIN_RESOLVED = 30    # minimum number of resolved liquid predictions before scoring
BRIER_GATE = 0.25    # Brier score must be strictly below this to pass

# Make the bot's own `main` module importable from the install directory.
sys.path.insert(0, REPORT_DIR)
 
def log(*a):
    """Print the given values on one line and flush stdout straight away.

    Flushing on every call keeps the output visible in real time when stdout
    is redirected to a file or pipe instead of a terminal.
    """
    print(*a, sep=" ", end="\n", flush=True)
 
def backfill_outcome(main, ticker):
    """Look up how a market settled via the Kalshi client.

    Args:
        main: object exposing ``kalshi_get(path)``; it is called with
            ``/markets/<ticker>``.
        ticker: market ticker to query.

    Returns:
        1 when the response's ``market.result`` is "yes", 0 when it is "no"
        (case-insensitive), and None for any other value, a missing/empty
        response, or any exception raised along the way (best effort).
    """
    settled = {"yes": 1, "no": 0}
    try:
        payload = main.kalshi_get(f"/markets/{ticker}") or {}
        market = payload.get("market") or {}
        verdict = (market.get("result") or "").lower()
        # Anything that is not an explicit yes/no maps to None (open/unknown).
        return settled.get(verdict)
    except Exception:
        return None
 
def main_eval():
    """Score the SHADOW pivot against the decision gate and publish the verdict.

    Pipeline:
      1. Import the bot's ``main`` module (used for ``kalshi_get`` via
         ``backfill_outcome`` and for ``kalshi_taker_fee``). If the import
         fails, a FATAL report is written and the function returns.
      2. Read every prediction whose recommendation starts with ``SHADOW``,
         together with its market volume. The database is only read (one
         SELECT) and the connection is closed as soon as the rows are fetched.
      3. Keep rows with price >= MIN_PRICE and volume >= MIN_VOLUME.
      4. Resolve each remaining row from ``actual_outcome`` or, failing that,
         from ``backfill_outcome`` (one lookup per distinct ticker per run).
      5. With fewer than MIN_RESOLVED resolved rows, report INSUFFICIENT.
         Otherwise compute win rate, Brier score and mean per-trade EV, break
         results down by (ticker prefix, market_type) cell, and report
         PASS/FAIL. The cell check uses the cell with the most resolved
         predictions, which must have n >= 10 and positive mean EV.

    Every report line is printed as it is produced and the full report is
    handed to ``_write_and_notify`` at each exit.

    Raises:
        sqlite3.Error: if the predictions cannot be read; a FATAL report is
            written first and the error is then re-raised.
    """
    started = datetime.datetime.now()
    ts = started.strftime("%Y%m%d-%H%M")
    report_path = os.path.join(REPORT_DIR, f"c4_report_{ts}.txt")
    out = []

    def emit(s=""):
        # Collect the line for the report file and echo it immediately.
        out.append(s)
        log(s)

    emit(f"=== Hermes C4 evaluation - {started.isoformat(timespec='seconds')} ===")

    try:
        import main
    except Exception as e:
        emit(f"FATAL: cannot import main.py: {e}")
        _write_and_notify(report_path, out, None)
        return

    # market_history is aggregated to one row per ticker before joining.
    # NOTE(review): presumably that table holds several snapshots per ticker
    # (confirm against the schema); joining it raw would then repeat each
    # prediction once per snapshot and inflate every count below. MAX keeps a
    # prediction "liquid" whenever any of its snapshots clears MIN_VOLUME,
    # i.e. the same rows pass the filter as before, just without duplicates.
    try:
        con = sqlite3.connect(DB)
        try:
            con.row_factory = sqlite3.Row
            rows = con.execute(
                """SELECT p.id, p.ticker, p.market_title, p.ensemble_probability ep,
                          p.market_price px, p.edge, p.recommendation rec, p.market_type,
                          p.actual_outcome, p.predicted_at,
                          COALESCE(mh.volume_fp, 0) vol
                   FROM predictions p
                   LEFT JOIN (SELECT ticker, MAX(volume_fp) AS volume_fp
                              FROM market_history
                              GROUP BY ticker) mh ON mh.ticker = p.ticker
                   WHERE p.recommendation LIKE 'SHADOW%'
                   ORDER BY p.id"""
            ).fetchall()
        finally:
            # Rows are fully fetched, so the connection is not needed again.
            con.close()
    except sqlite3.Error as e:
        # Unattended run: make the failure visible in the report/ping, then
        # re-raise so the process still exits non-zero.
        emit(f"FATAL: cannot read shadow predictions from {DB}: {e}")
        _write_and_notify(report_path, out, None)
        raise

    emit(f"shadow predictions on record: {len(rows)}")
    if not rows:
        emit("No shadow predictions yet - collection has not produced data. "
             "Recommend: verify SHADOW_MODE scanner is running; re-check in ~1 week.")
        _write_and_notify(report_path, out, "INSUFFICIENT")
        return

    liquid = [r for r in rows if (r["px"] or 0) >= MIN_PRICE and (r["vol"] or 0) >= MIN_VOLUME]
    emit(f"after liquidity filter (px>=${MIN_PRICE:.2f} & vol>={MIN_VOLUME}): {len(liquid)}")

    # Resolve outcomes. Stored values win; otherwise ask the API, at most once
    # per ticker so repeated predictions on one market share a single lookup.
    backfilled = {}
    resolved = []
    for r in liquid:
        ao = r["actual_outcome"]
        if ao in (0, 1):
            outcome = int(ao)
        elif isinstance(ao, str) and ao.strip().lower() in ("yes", "no"):
            outcome = 1 if ao.strip().lower() == "yes" else 0
        else:
            ticker = r["ticker"]
            if ticker not in backfilled:
                backfilled[ticker] = backfill_outcome(main, ticker)
            outcome = backfilled[ticker]
        if outcome is not None:
            resolved.append((r, outcome))
    emit(f"resolved (liquid): {len(resolved)} / need {MIN_RESOLVED}")

    if len(resolved) < MIN_RESOLVED:
        emit("")
        emit(f"VERDICT: INSUFFICIENT DATA - {len(resolved)} resolved liquid predictions "
             f"(< {MIN_RESOLVED}). Do NOT evaluate edge yet. Keep SHADOW_MODE running; "
             f"re-run this eval in ~7 days. (Total shadow rows {len(rows)}, "
             f"liquid {len(liquid)} - if liquid stays tiny, the pivot's market set "
             f"may be structurally illiquid → that itself is a finding.)")
        _write_and_notify(report_path, out, "INSUFFICIENT")
        return

    briers, wins, evs = [], 0, []
    fee_failures = 0
    cell = {}  # (city, market_type) -> [count, wins, summed EV]
    for r, outcome in resolved:
        # A missing probability is scored as an uninformative 0.5 forecast.
        ep = r["ep"] if r["ep"] is not None else 0.5
        briers.append((ep - outcome) ** 2)
        side_yes = "YES" in (r["rec"] or "").upper()
        px = r["px"] or 0.0
        try:
            fee = main.kalshi_taker_fee(px)
        except Exception:
            # Best effort: keep scoring, but count it so the report can say
            # that the "after fees" EV is optimistic.
            fee = 0.0
            fee_failures += 1
        won = (outcome == 1) if side_yes else (outcome == 0)
        # NOTE(review): px is used as the entry cost for both sides. That is
        # right only if market_price is stored as the price of the side being
        # recommended; if it is always the YES price, NO-side trades should
        # cost (1 - px). Confirm against the scanner that writes predictions.
        ev = ((1.0 - px) - fee) if won else (-(px + fee))
        evs.append(ev)
        wins += 1 if won else 0
        # Guard against NULL ticker / non-text market_type so one odd row
        # cannot crash the breakdown (the cell line formats both as strings).
        city = (r["ticker"] or "?").split("-")[0]
        key = (city, str(r["market_type"] or "?"))
        c = cell.setdefault(key, [0, 0, 0.0])
        c[0] += 1
        c[1] += 1 if won else 0
        c[2] += ev

    n = len(resolved)
    brier = statistics.mean(briers)
    wr = wins / n
    ev_mean = statistics.mean(evs)
    emit("")
    emit(f"n={n}  WR={wr*100:.1f}%  Brier={brier:.4f}  EV/trade=${ev_mean:+.3f} (after fees)")
    if fee_failures:
        emit(f"WARNING: fee lookup failed for {fee_failures} of {n} trades; those were "
             "scored with fee=0, so EV/trade is overstated.")
    emit("by cell (city, market_type):")
    best_cell = None
    # Largest cell first; the first one listed is the cell the gate checks.
    for key, (cn, cw, cev) in sorted(cell.items(), key=lambda kv: -kv[1][0]):
        cev_avg = cev / cn if cn else 0
        emit(f"  {key[0]:14s} {key[1]:6s}  n={cn:3d}  WR={cw/cn*100:4.0f}%  EV=${cev_avg:+.3f}")
        if best_cell is None:
            best_cell = (key, cn, cw / cn, cev_avg)

    cell_ok = bool(best_cell and best_cell[1] >= 10 and best_cell[3] > 0)
    passed = (ev_mean > 0) and (brier < BRIER_GATE) and cell_ok
    emit("")
    if passed:
        emit("VERDICT: GATE PASSED - pivot shows positive EV after fees, Brier "
             f"< {BRIER_GATE}, and holds in the highest-volume cell "
             f"{best_cell[0]} (n={best_cell[1]}, EV=${best_cell[3]:+.3f}). "
             "RECOMMEND: propose a SMALL live pilot to the user. Do NOT auto-enable.")
    else:
        why = []
        if ev_mean <= 0:
            why.append(f"EV/trade ${ev_mean:+.3f} not > 0")
        if brier >= BRIER_GATE:
            why.append(f"Brier {brier:.3f} not < {BRIER_GATE}")
        if not cell_ok:
            why.append("does not hold in the highest-volume cell")
        emit("VERDICT: GATE FAILED - " + "; ".join(why) + ". "
             "RECOMMEND: iterate SHADOW_GAP_K / SHADOW_MIN_BRACKET_WIDTH, or retire. "
             "Do NOT enable live trading.")
    emit("")
    emit("(Auto-trading untouched. This script never writes auto_config.)")
    _write_and_notify(report_path, out, "PASS" if passed else "FAIL")
 
def _write_and_notify(path, lines, status):
    body = "\n".join(lines)
    try:
        with open(path, "w") as f:
            f.write(body + "\n")
    except Exception:
        pass
 
    try:
        import main
        hook = getattr(main, "DISCORD_WEBHOOK", None) or os.getenv("DISCORD_WEBHOOK")
        if hook:
            import urllib.request
            tag = {"PASS": "✅", "FAIL": "❌", "INSUFFICIENT": "⏳"}.get(status, "ℹ️")
            msg = f"{tag} **Hermes C4 eval** ({status})\n```\n{body[-1500:]}\n```"
            req = urllib.request.Request(
                hook, data=json.dumps({"content": msg}).encode(),
                headers={"Content-Type": "application/json"})
            urllib.request.urlopen(req, timeout=10)
    except Exception:
        pass
 
if __name__ == "__main__":
    # Script entry point: run the evaluation once. Any uncaught error is
    # printed with its traceback and turned into exit status 1.
    try:
        main_eval()
    except Exception:
        traceback.print_exc()
        raise SystemExit(1)

1	"""Hermes C4 - shadow-pivot evaluation (unattended).
2
3	Runs on CT-REDACTED via system cron (no Claude session needed). Pulls SHADOW
4	predictions from hermes.db, backfills outcomes from the Kalshi API, applies a
5	hard liquidity filter, and scores the pivot against the pre-committed decision
6	gate. Writes a report + Discord ping. Idempotent; safe to re-run.
7
8	Decision gate (from PIVOT_SPEC.md): propose a small live pilot ONLY if, on the
9	liquid subset with >=30 resolved:
10	EV/trade after fees > 0 AND Brier < 0.25 AND it holds in the
11	highest-volume city/market-type cell (not one lucky cluster).
12	Otherwise: iterate SHADOW_GAP_K / SHADOW_MIN_BRACKET_WIDTH, or retire.
13	NEVER enables live trading. Never writes auto_config.
14	"""
15	import sys, os, json, sqlite3, statistics, datetime, traceback
16
17	DB = "$HERMES_HOME/hermes.db"
18	REPORT_DIR = "$HERMES_HOME"
19	MIN_PRICE = 0.10
20	MIN_VOLUME = 20
21	MIN_RESOLVED = 30
22	BRIER_GATE = 0.25
23	sys.path.insert(0, "$HERMES_HOME")
24
25	def log(*a):
26	print(*a, flush=True)
27
28	def backfill_outcome(main, ticker):
29	"""Return 1 if YES settled, 0 if NO, None if still open/unknown."""
30	try:
31	m = main.kalshi_get(f"/markets/{ticker}")
32	mk = (m or {}).get("market") or {}
33	res = (mk.get("result") or "").lower()
34	if res == "yes":
35	return 1
36	if res == "no":
37	return 0
38	return None
39	except Exception:
40	return None
41
42	def main_eval():
43	ts = datetime.datetime.now().strftime("%Y%m%d-%H%M")
44	report_path = os.path.join(REPORT_DIR, f"c4_report_{ts}.txt")
45	out = []
46
47	def emit(s=""):
48	out.append(s); log(s)
49
50	emit(f"=== Hermes C4 evaluation - {datetime.datetime.now().isoformat(timespec='seconds')} ===")
51
52	try:
53	import main
54	except Exception as e:
55	emit(f"FATAL: cannot import main.py: {e}")
56	_write_and_notify(report_path, out, None)
57	return
58
59	con = sqlite3.connect(DB)
60	con.row_factory = sqlite3.Row
61	rows = con.execute(
62	"""SELECT p.id, p.ticker, p.market_title, p.ensemble_probability ep,
63	p.market_price px, p.edge, p.recommendation rec, p.market_type,
64	p.actual_outcome, p.predicted_at,
65	COALESCE(mh.volume_fp, 0) vol
66	FROM predictions p
67	LEFT JOIN market_history mh ON mh.ticker = p.ticker
68	WHERE p.recommendation LIKE 'SHADOW%'
69	ORDER BY p.id"""
70	).fetchall()
71	emit(f"shadow predictions on record: {len(rows)}")
72	if not rows:
73	emit("No shadow predictions yet - collection has not produced data. "
74	"Recommend: verify SHADOW_MODE scanner is running; re-check in ~1 week.")
75	_write_and_notify(report_path, out, "INSUFFICIENT")
76	con.close()
77	return
78
79	liquid = [r for r in rows if (r["px"] or 0) >= MIN_PRICE and (r["vol"] or 0) >= MIN_VOLUME]
80	emit(f"after liquidity filter (px>=${MIN_PRICE:.2f} & vol>={MIN_VOLUME}): {len(liquid)}")
81
82	resolved = []
83	for r in liquid:
84	ao = r["actual_outcome"]
85	outcome = None
86	if ao in (0, 1):
87	outcome = int(ao)
88	elif isinstance(ao, str) and ao.strip().lower() in ("yes", "no"):
89	outcome = 1 if ao.strip().lower() == "yes" else 0
90	else:
91	outcome = backfill_outcome(main, r["ticker"])
92	if outcome is not None:
93	resolved.append((r, outcome))
94	emit(f"resolved (liquid): {len(resolved)} / need {MIN_RESOLVED}")
95
96	if len(resolved) < MIN_RESOLVED:
97	emit("")
98	emit(f"VERDICT: INSUFFICIENT DATA - {len(resolved)} resolved liquid predictions "
99	f"(< {MIN_RESOLVED}). Do NOT evaluate edge yet. Keep SHADOW_MODE running; "
100	f"re-run this eval in ~7 days. (Total shadow rows {len(rows)}, "
101	f"liquid {len(liquid)} - if liquid stays tiny, the pivot's market set "
102	f"may be structurally illiquid → that itself is a finding.)")
103	_write_and_notify(report_path, out, "INSUFFICIENT")
104	con.close()
105	return
106
107	briers, wins, evs = [], 0, []
108	cell = {}
109	for r, outcome in resolved:
110	ep = r["ep"] if r["ep"] is not None else 0.5
111	briers.append((ep - outcome) ** 2)
112	side_yes = "YES" in (r["rec"] or "").upper()
113	px = r["px"] or 0.0
114	try:
115	fee = main.kalshi_taker_fee(px)
116	except Exception:
117	fee = 0.0
118	won = (outcome == 1) if side_yes else (outcome == 0)
119	ev = ((1.0 - px) - fee) if won else (-(px + fee))
120	evs.append(ev)
121	wins += 1 if won else 0
122	city = r["ticker"].split("-")[0]
123	key = (city, r["market_type"] or "?")
124	c = cell.setdefault(key, [0, 0, 0.0])
125	c[0] += 1; c[1] += 1 if won else 0; c[2] += ev
126
127	n = len(resolved)
128	brier = statistics.mean(briers)
129	wr = wins / n
130	ev_mean = statistics.mean(evs)
131	emit("")
132	emit(f"n={n} WR={wr*100:.1f}% Brier={brier:.4f} EV/trade=${ev_mean:+.3f} (after fees)")
133	emit("by cell (city, market_type):")
134	best_cell = None
135	for key, (cn, cw, cev) in sorted(cell.items(), key=lambda kv: -kv[1][0]):
136	cev_avg = cev / cn if cn else 0
137	emit(f" {key[0]:14s} {key[1]:6s} n={cn:3d} WR={cw/cn*100:4.0f}% EV=${cev_avg:+.3f}")
138	if best_cell is None:
139	best_cell = (key, cn, cw / cn, cev_avg)
140
141	cell_ok = bool(best_cell and best_cell[1] >= 10 and best_cell[3] > 0)
142	passed = (ev_mean > 0) and (brier < BRIER_GATE) and cell_ok
143	emit("")
144	if passed:
145	emit("VERDICT: GATE PASSED - pivot shows positive EV after fees, Brier "
146	f"< {BRIER_GATE}, and holds in the highest-volume cell "
147	f"{best_cell[0]} (n={best_cell[1]}, EV=${best_cell[3]:+.3f}). "
148	"RECOMMEND: propose a SMALL live pilot to the user. Do NOT auto-enable.")
149	else:
150	why = []
151	if ev_mean <= 0: why.append(f"EV/trade ${ev_mean:+.3f} not > 0")
152	if brier >= BRIER_GATE: why.append(f"Brier {brier:.3f} not < {BRIER_GATE}")
153	if not cell_ok: why.append("does not hold in the highest-volume cell")
154	emit("VERDICT: GATE FAILED - " + "; ".join(why) + ". "
155	"RECOMMEND: iterate SHADOW_GAP_K / SHADOW_MIN_BRACKET_WIDTH, or retire. "
156	"Do NOT enable live trading.")
157	emit("")
158	emit("(Auto-trading untouched. This script never writes auto_config.)")
159	con.close()
160	_write_and_notify(report_path, out, "PASS" if passed else "FAIL")
161
162	def _write_and_notify(path, lines, status):
163	body = "\n".join(lines)
164	try:
165	with open(path, "w") as f:
166	f.write(body + "\n")
167	except Exception:
168	pass
169
170	try:
171	import main
172	hook = getattr(main, "DISCORD_WEBHOOK", None) or os.getenv("DISCORD_WEBHOOK")
173	if hook:
174	import urllib.request
175	tag = {"PASS": "✅", "FAIL": "❌", "INSUFFICIENT": "⏳"}.get(status, "ℹ️")
176	msg = f"{tag} Hermes C4 eval ({status})\n```\n{body[-1500:]}\n```"
177	req = urllib.request.Request(
178	hook, data=json.dumps({"content": msg}).encode(),
179	headers={"Content-Type": "application/json"})
180	urllib.request.urlopen(req, timeout=10)
181	except Exception:
182	pass
183
184	if __name__ == "__main__":
185	try:
186	main_eval()
187	except Exception:
188	traceback.print_exc()
189	sys.exit(1)