# fairvalue-api / app.py — FairValue
# feat: production web app — React/Vite frontend + FastAPI backend with Render/Vercel deployment
# commit b72652e
import streamlit as st
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from src.models.explain import generate_shap_explanation
from src.features.build_features import calculate_risk_score
# Page-wide Streamlit configuration; must be the first st.* call in the script.
st.set_page_config(page_title="FairValue Transfer Cap Estimator", layout="wide")
# ── Currency Config ───────────────────────────────────────────────────────────
EUR_TO_GBP = 0.85  # Approximate conversion — review quarterly
st.title("FairValue Transfer Cap Estimator")
st.markdown(
    "A rigorous, data-driven 'Transfer Ceiling Calculator' grounded in ML "
    "and Hedonic Pricing Theory."
)
# ── Data Loading ─────────────────────────────────────────────────────────────
# Fixed: was @st.cache_resource — mutations persisted across user sessions.
# @st.cache_data correctly serialises DataFrames per session.
@st.cache_data
def load_player_data():
    """Load the processed per-player feature table from disk.

    Reads ``data/processed/app_features.csv``, renames any column whose name
    contains both "market" and "value" to the canonical
    ``market_value_in_eur``, and drops duplicate columns that the rename may
    produce.

    Returns:
        pandas.DataFrame | None: the feature table, or ``None`` when the CSV
        does not exist.
    """
    csv_path = "data/processed/app_features.csv"
    if not os.path.exists(csv_path):
        return None
    players = pd.read_csv(csv_path)
    # Collect every market-value column variant produced by the pipeline.
    value_cols = [
        c for c in players.columns
        if 'market' in c.lower() and 'value' in c.lower()
    ]
    if value_cols:
        players.rename(
            columns={c: 'market_value_in_eur' for c in value_cols},
            inplace=True,
        )
        # Renaming several variants to one name can create duplicates;
        # keep the first occurrence only.
        players = players.loc[:, ~players.columns.duplicated()].copy()
    return players
# ── Model Loading ─────────────────────────────────────────────────────────────
# @st.cache_resource is correct here — the model object is shared, not copied.
@st.cache_resource
def load_model():
    """Load the serialized XGBoost regressor from the project root.

    Tries the known model filenames in priority order and loads the first
    one that exists on disk.

    Returns:
        tuple: ``(model, size_bytes, path)`` on success, or
        ``(None, 0, None)`` when no model file is found.
    """
    candidates = ("fairvalue_xgboost.json", "FairValue_xgboost.json")
    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        regressor = xgb.XGBRegressor()
        regressor.load_model(candidate)
        return regressor, os.path.getsize(candidate), candidate
    return None, 0, None
# Fail fast: stop the script (st.stop) when the data or model is unusable,
# so no downstream widget ever renders against a broken state.
df = load_player_data()
model, model_size, model_path = load_model()
if df is None:
    st.error("⚠️ Data file not found: `data/processed/app_features.csv`. Re-run the pipeline.")
    st.stop()
if model is None:
    st.error("⚠️ MODEL_FILE_NOT_FOUND — ensure `fairvalue_xgboost.json` is in the project root.")
    st.stop()
# Heuristic corruption check: a real serialized model is far larger than 1 KB
# (a truncated upload typically isn't).
if model_size < 1000:
    st.error(
        f"❌ MODEL CORRUPTED: `{model_path}` is only {model_size:,} bytes "
        "(expected ~400 KB). Please re-upload the correct model file."
    )
    st.stop()  # Fixed: was missing — execution continued and produced silent zero predictions
# ── Sidebar ───────────────────────────────────────────────────────────────────
st.sidebar.header("Player Transfer Profile")
input_mode = st.sidebar.radio("Input Mode", ["Select Existing Player", "Create Custom Player"])
# The pipeline has emitted several name-column variants over time; use the
# first one present in the data (None when absent).
name_col = next(
    (c for c in ['name', 'name_x', 'Player_Name', 'Name'] if c in df.columns), None
)
selected_name = ""
if input_mode == "Select Existing Player":
    if name_col is None:
        st.error("No player name column found in data. Please re-run the pipeline.")
        st.stop()
    player_list = sorted(df[name_col].astype(str).unique().tolist())
    selected_name = st.sidebar.selectbox("Target Database Player", player_list)
    # iloc[0:1] keeps a one-row DataFrame (not a Series) so column-based
    # operations downstream still work.
    player_data = df[df[name_col].astype(str) == selected_name].iloc[0:1].copy()
    # Sliders are seeded from the selected player's stored attributes.
    # NOTE(review): a stored Contract_Years_Left outside [0.5, 6.0] would make
    # Streamlit reject the default value — confirm upstream clamping.
    contract_years = st.sidebar.slider(
        "Contract Years Remaining", 0.5, 6.0,
        float(player_data['Contract_Years_Left'].iloc[0]), 0.5
    )
    age = st.sidebar.slider("Age", 16, 40, int(player_data['Age'].iloc[0]))
    # Injury column name also varies by pipeline version; fall back to a
    # default of 10 missed days when neither variant exists.
    inj_col = next(
        (c for c in ['Injury_Days_Total_24m', 'Injury_Days'] if c in player_data.columns), None
    )
    injuries = st.sidebar.number_input(
        "Injury missed days (24m)", 0, 500,
        int(player_data[inj_col].iloc[0]) if inj_col else 10
    )
else:
    # Custom player: seed every numeric feature with the dataset median as a
    # neutral baseline, then let the user override the headline attributes.
    player_data = df.median(numeric_only=True).to_frame().T
    contract_years = st.sidebar.slider("Contract Years Remaining", 0.5, 6.0, 2.0, 0.5)
    age = st.sidebar.slider("Age", 16, 40, 24)
    injuries = st.sidebar.number_input("Injury missed days (24m)", 0, 500, 10)
if 'market_value_in_eur' in player_data.columns:
    # UI works in £m; the model feature is EUR, so convert via the fixed rate.
    m_val = st.sidebar.number_input("Current Market Value Estimation (£m)", 1.0, 200.0, 20.0)
    player_data['market_value_in_eur'] = (m_val * 1_000_000) / EUR_TO_GBP
asking_price = st.sidebar.number_input("Selling Club Asking Price (£m)", 1.0, 300.0, 45.0)
# ── Hype Factor Integration (consumed from Page 3 Live Player Intel) ──────────
# Fixed: was never read — hard_cap calculation ignored session_state entirely.
# player_hype_metrics is keyed by lower-cased player name (hence .lower());
# custom players (empty selected_name) never receive a hype entry.
hype_all = st.session_state.get('player_hype_metrics', {})
hype_entry = hype_all.get(selected_name.lower(), {}) if selected_name else {}
hype_premium_pct = hype_entry.get('hype_premium_percent', 0.0)
if hype_premium_pct != 0.0:
    sign = "+" if hype_premium_pct > 0 else ""
    st.sidebar.info(
        f"💡 **Hype Factor Active:** {sign}{hype_premium_pct:.1f}% \n"
        "*Run Page 3 → Live Player Intel to refresh.*"
    )
else:
    st.sidebar.caption("No Hype Factor loaded. Run Page 3 to enrich this estimate.")
# ── Build Inference Vector ─────────────────────────────────────────────────────
expected_cols = model.feature_names_in_
player_data = player_data.copy()
# Overwrite the stored attributes with the live sidebar values.
player_data['Contract_Years_Left'] = contract_years
player_data['Age'] = age
if 'Injury_Days_Total_24m' in player_data.columns:
    player_data['Injury_Days_Total_24m'] = injuries
# Recompute derived risk flags so they reflect the slider values, not stale DB data.
# Without this, Risk_Contract / Risk_Age / Risk_Injury don't update when sliders move.
player_data = calculate_risk_score(
    player_data,
    contract_col='Contract_Years_Left',
    age_col='Age',
    injury_col='Injury_Days_Total_24m'
)
# Align to the model's training schema; features missing from player_data are
# zero-filled, extra columns are dropped.
X_infer = player_data.reindex(columns=expected_cols, fill_value=0)
# ── Prediction & Results (runs only when the user presses the button) ─────────
if st.button("Calculate Prediction", type="primary"):
    raw_preds = model.predict(X_infer)
    log_pv = raw_preds[0]
    # Invert the log transform and clamp at zero so a pathological prediction
    # can never display a negative fee.
    # NOTE(review): expm1 assumes the target was log1p(fee) — confirm against
    # the training pipeline before trusting absolute values.
    baseline_pv = max(float(np.expm1(log_pv)), 0.0)
    baseline_pv_m = baseline_pv / 1_000_000
    # Start the cap 15% below the point estimate before risk adjustments.
    conservative_bound = baseline_pv * 0.85
    # Internal rule-based risk discount (transparent to users)
    risk_pct = (
        (0.20 if contract_years < 1.5 else 0.0) +
        (0.15 if age > 30 else 0.0) +
        (0.10 if injuries > 60 else 0.0)
    )
    # External hype multiplier from Page 3 NLP sentiment
    hype_multiplier = 1.0 + (hype_premium_pct / 100.0)
    hard_cap = conservative_bound * (1.0 - risk_pct) * hype_multiplier
    # NOTE(review): hard_cap_m is compared directly against the £m asking
    # price below; if the model output is EUR this mixes currencies — verify.
    hard_cap_m = hard_cap / 1_000_000
    col1, col2 = st.columns([2, 1])
    with col1:
        st.subheader("Price Range Recommendation")
        # Gauge: needle = asking price; white threshold line = hard cap.
        # The delta indicator turns red when the asking price exceeds the cap.
        fig = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=asking_price,
            delta={'reference': hard_cap_m, 'increasing': {'color': "red"}},
            title={'text': "Asking Price vs Hard Cap (£m)"},
            gauge={
                'axis': {'range': [0, max(asking_price * 1.2, 100)]},
                'threshold': {'line': {'color': "white", 'width': 4}, 'value': hard_cap_m}
            }
        ))
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        st.subheader("Metrics Breakdown")
        st.metric("Predicted Market Value (Baseline)", f"£{baseline_pv_m:.1f}m")
        st.metric("Risk-Adjusted Hard Cap", f"£{hard_cap_m:.1f}m")
        if hype_premium_pct != 0.0:
            sign = "+" if hype_premium_pct > 0 else ""
            st.metric("Hype / Form Adjustment", f"{sign}{hype_premium_pct:.1f}%")
        if asking_price > hard_cap_m:
            overpay = asking_price - hard_cap_m
            st.error(f"⚠️ Winner's Curse Risk: £{overpay:.1f}m Overpay")
        else:
            st.success("✅ Asking price is within Fair Value bounds.")
    # ── SHAP Explainability Panel ─────────────────────────────────────────
    # Fixed: shap was imported but never used. Now wired to generate_shap_explanation.
    st.markdown("---")
    st.subheader("🔬 XAI Explainability — What Drives This Valuation?")
    with st.spinner("Computing SHAP feature contributions..."):
        try:
            # Only the per-feature contribution table (second element of the
            # returned pair) is used here.
            _, explanation_df = generate_shap_explanation(model, X_infer)
            top_shap = explanation_df.head(10).copy()
            # Human-readable labels: snake_case → Title Case.
            top_shap['Label'] = top_shap['Feature'].str.replace('_', ' ').str.title()
            fig_shap = go.Figure(go.Bar(
                x=top_shap['Contribution_to_LogPrice'],
                y=top_shap['Label'],
                orientation='h',
                # Red for value-depressing features, green for value-boosting.
                marker_color=[
                    '#e74c3c' if v < 0 else '#2ecc71'
                    for v in top_shap['Contribution_to_LogPrice']
                ],
                text=[f"{v:+.3f}" for v in top_shap['Contribution_to_LogPrice']],
                textposition='outside',
            ))
            fig_shap.update_layout(
                title="Top 10 Feature Contributions to Transfer Fee (Log-Price Scale)",
                xaxis_title="SHAP Value (Additive impact on log transfer fee)",
                yaxis={'categoryorder': 'total ascending'},
                height=420,
                margin=dict(l=10, r=10, t=50, b=10),
            )
            st.plotly_chart(fig_shap, use_container_width=True)
            st.caption(
                "🟢 Green = boosts transfer value | 🔴 Red = depresses transfer value | "
                "Sorted by absolute magnitude."
            )
        except Exception as shap_err:
            # Best-effort panel: a SHAP failure must not break the prediction UI.
            st.warning(f"SHAP panel unavailable: {shap_err}")
    with st.expander("🛠️ Technical Deep Dive (Internal Metrics)"):
        st.write(f"**Model:** `{model_path}` ({model_size:,} bytes)")
        st.write(f"**Raw Log-Scale Prediction:** `{log_pv:.4f}`")
        mv_val = (
            X_infer['market_value_in_eur'].iloc[0]
            if 'market_value_in_eur' in X_infer.columns else "MISSING"
        )
        # numpy scalar floats subclass float, so the isinstance check also
        # catches values read from the DataFrame.
        if isinstance(mv_val, (int, float)):
            st.write(f"**Market Value Input (EUR):** `{mv_val:,.0f}`")
        else:
            st.write(f"**Market Value Input:** `{mv_val}`")
        st.write("**Full Feature Vector:**", X_infer)
# Static footer — rendered regardless of whether a prediction has been run.
st.markdown("---")
st.subheader("Model Performance Assessment")
st.markdown(f"**Training Set:** `{len(df):,}` transfers | **Engine:** XGBoost + RandomizedSearchCV")
st.markdown("**Validation MAE:** `~£23,980,000` | **Status:** Production Ready")