# moPPIt / classifier_code / half_life.py
# AlienChen's picture
# Update classifier_code/half_life.py
# f52b5d3 verified
# Raw
# History Blame Contribute Delete
# 7.32 kB
import os
from typing import List, Optional, Union
import numpy as np
import torch
import torch.nn as nn
from transformers import EsmModel, AutoTokenizer
# -----------------------------
# Model definition (must match training)
# -----------------------------
class TransformerRegressor(nn.Module):
    """Transformer encoder with masked mean pooling that emits one scalar per sequence.

    Per-token features are linearly projected to ``d_model``, passed through a
    batch-first ``nn.TransformerEncoder`` (GELU activation), averaged over the
    positions marked as real tokens, and mapped to a single value.

    The submodule attribute names (``proj``, ``enc``, ``head``) determine the
    state-dict keys, so they must stay as they are for checkpoints to load.
    """

    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
        """Build the projection, the encoder stack and the scalar head.

        Args:
            in_dim: Size of each incoming per-token feature vector.
            d_model: Width of the encoder.
            nhead: Number of attention heads per encoder layer.
            layers: Number of stacked encoder layers.
            ff: Hidden width of each layer's feed-forward sub-block.
            dropout: Dropout probability used inside the encoder layers.
        """
        super().__init__()
        self.proj = nn.Linear(in_dim, d_model)
        self.enc = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=ff,
                dropout=dropout,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=layers,
        )
        self.head = nn.Linear(d_model, 1)

    def forward(self, X, M):
        """Predict one value per sequence.

        Args:
            X: Per-token features; the last axis is projected by ``self.proj``.
            M: Mask aligned with the first two axes of ``X``;
                True = keep token, False = padding.

        Returns:
            The head output with its trailing size-1 axis squeezed away.
        """
        # The encoder expects the inverse convention (True = ignore this key).
        hidden = self.enc(self.proj(X), src_key_padding_mask=~M)

        # Masked mean over the token axis; the clamp keeps the division
        # defined for a row that has no kept token at all.
        keep = M.unsqueeze(-1).float()
        kept_count = keep.sum(dim=1).clamp(min=1.0)
        mean_hidden = (hidden * keep).sum(dim=1) / kept_count

        return self.head(mean_hidden).squeeze(-1)
def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:
    """Construct the regressor architecture used for inference.

    Args:
        model_name: Architecture identifier; only ``"transformer"`` is accepted.
        in_dim: Size of the per-token input features.
        params: Accepted for interface compatibility but not consulted here;
            the hyper-parameters below are fixed in code.

    Returns:
        A freshly initialised ``TransformerRegressor``.

    Raises:
        ValueError: If ``model_name`` is anything other than ``"transformer"``.
    """
    if model_name == "transformer":
        # Fixed architecture settings; they have to agree with the checkpoint
        # that will be loaded into the returned module.
        fixed_hparams = {
            "d_model": 384,
            "nhead": 4,
            "layers": 1,
            "ff": 512,
            "dropout": 0.1521676463658988,
        }
        return TransformerRegressor(in_dim=in_dim, **fixed_hparams)

    raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")
def _clean_state_dict(state_dict: dict) -> dict:
cleaned = {}
for k, v in state_dict.items():
if k.startswith("module."):
k = k[len("module.") :]
if k.startswith("model."):
k = k[len("model.") :]
cleaned[k] = v
return cleaned
# -----------------------------
# Predictor
# -----------------------------
class HalflifeTransformer:
    """Half-life predictor: per-residue ESM embeddings fed to a Transformer regressor.

    The constructor loads a regressor checkpoint and the ESM embedding model
    plus tokenizer. Calling the instance with sequences returns one prediction
    per sequence as a float32 NumPy array.

    Attributes set by ``__init__``:
        device: ``torch.device`` everything runs on.
        best_params: ``ckpt["best_params"]`` (``{}`` when absent).
        in_dim: Per-token feature size recorded in the checkpoint.
        target_col: ``ckpt["target_col"]`` (``"label"`` when absent); the value
            ``"log_label"`` makes ``predict_hours`` apply ``expm1``.
        model_name: Architecture identifier handed to ``build_model``.
        regressor: The loaded regressor in eval mode.
        emb_model / tokenizer: The ESM model (eval mode) and its tokenizer.
    """

    def __init__(
        self,
        ckpt_path: str = "/scratch/pranamlab/tong/PeptiVerse/src/halflife/FINETUNED_TRANSFORMER_DIR/final_model.pt",
        esm_name: str = "facebook/esm2_t33_650M_UR50D",
        device: Optional[str] = None,
        model_name: str = "transformer",
    ):
        """Load the regressor checkpoint and the ESM embedding model.

        Args:
            ckpt_path: Path to a checkpoint dict holding at least
                ``"state_dict"`` and ``"in_dim"``.
            esm_name: Hugging Face identifier of the ESM model/tokenizer.
            device: Device string; defaults to CUDA when available, else CPU.
            model_name: Architecture identifier passed to ``build_model``.

        Raises:
            ValueError: If the checkpoint is not a dict with ``"state_dict"``,
                lacks ``"in_dim"``, or its ``in_dim`` differs from the ESM
                model's hidden size.
        """
        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))

        # NOTE(security): depending on the installed PyTorch version's
        # `weights_only` default, torch.load may unpickle arbitrary objects.
        # Only point ckpt_path at checkpoints from a trusted source.
        ckpt = torch.load(ckpt_path, map_location="cpu")
        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")
        if ckpt.get("in_dim") is None:
            # Fail with a clear message instead of the TypeError int(None) would raise.
            raise ValueError(f"Checkpoint at {ckpt_path} has no 'in_dim' entry.")

        self.best_params = ckpt.get("best_params", {})
        self.in_dim = int(ckpt["in_dim"])
        self.target_col = ckpt.get("target_col", "label")  # 'log_label' or 'label'
        self.model_name = model_name

        # --- build + load regressor ---
        self.regressor = build_model(model_name=self.model_name, in_dim=self.in_dim, params=self.best_params)
        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)
        self.regressor.to(self.device)
        self.regressor.eval()

        # --- ESM2 embedding model ---
        self.emb_model = EsmModel.from_pretrained(esm_name).to(self.device)
        self.emb_model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(esm_name)

        # sanity: ESM2 hidden size should match training in_dim
        esm_hidden = int(self.emb_model.config.hidden_size)
        if esm_hidden != self.in_dim:
            raise ValueError(
                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
                f"Did you train on a different embedding model/dimension than {esm_name}?"
            )

    @staticmethod
    def _as_sequence_list(input_seqs):
        """Wrap a bare string as a one-element list; return anything else unchanged.

        Without this, a single ``str`` would be sliced and measured as a
        sequence of characters, which silently yields wrongly shaped output.
        """
        if isinstance(input_seqs, str):
            return [input_seqs]
        return input_seqs

    @torch.no_grad()
    def _embed_unpooled_batch(
        self,
        sequences: Union[str, List[str]],
        max_length: int = 1024,
    ):
        """Embed a batch of sequences without pooling.

        Args:
            sequences: Sequences to embed (a single string counts as one sequence).
            max_length: Tokenizer truncation length (special tokens included).

        Returns:
            X: (B, Lmax, H) float32
            M: (B, Lmax) bool, True for real residues, False for padding

            For an empty batch the shapes are (0, 1, in_dim) and (0, 1).
            A row with two or fewer attended tokens has an all-False mask.
        """
        sequences = self._as_sequence_list(sequences)

        if len(sequences) == 0:
            X = torch.zeros((0, 1, self.in_dim), dtype=torch.float32, device=self.device)
            M = torch.zeros((0, 1), dtype=torch.bool, device=self.device)
            return X, M

        toks = self.tokenizer(
            sequences,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True,
        )
        toks = {k: v.to(self.device) for k, v in toks.items()}

        out = self.emb_model(**toks)
        hs = out.last_hidden_state  # (B, T, H)
        attn = toks["attention_mask"].bool()  # (B, T)
        n_rows, hidden_size = hs.shape[0], hs.shape[-1]

        per_seq = []
        for row_hidden, row_attn in zip(hs, attn):
            valid_idx = torch.nonzero(row_attn, as_tuple=False).squeeze(-1)
            # ESM typically has <cls> ... tokens ... <eos> among valid positions,
            # so the first and last attended positions are dropped.
            if valid_idx.numel() <= 2:
                per_seq.append(hs.new_zeros((0, hidden_size)))
            else:
                per_seq.append(row_hidden[valid_idx[1:-1], :])  # (L, H)

        Lmax = max(emb.shape[0] for emb in per_seq)

        # Rows are sized from the encoder output so X/M always agree with it.
        X = hs.new_zeros((n_rows, Lmax, hidden_size), dtype=torch.float32)
        M = torch.zeros((n_rows, Lmax), dtype=torch.bool, device=self.device)
        for i, emb in enumerate(per_seq):
            n_res = emb.shape[0]
            if n_res:
                X[i, :n_res, :] = emb.to(torch.float32)
                M[i, :n_res] = True

        return X, M

    @torch.no_grad()
    def predict_raw(
        self,
        input_seqs: Union[str, List[str]],
        batch_size: int = 16,
    ) -> np.ndarray:
        """Run the regressor and return its untransformed outputs.

        The values are in whatever space the checkpoint's ``target_col``
        denotes; no inverse transform is applied here (see ``predict_hours``).

        Args:
            input_seqs: Sequences to score (a single string counts as one sequence).
            batch_size: Number of sequences embedded per forward pass; must be >= 1.

        Returns:
            float32 array with one value per input sequence (empty for no input).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 and there is input to process.
        """
        input_seqs = self._as_sequence_list(input_seqs)

        if len(input_seqs) == 0:
            return np.array([], dtype=np.float32)
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got: {batch_size}")

        preds = []
        for start in range(0, len(input_seqs), batch_size):
            X, M = self._embed_unpooled_batch(input_seqs[start : start + batch_size])
            yhat = self.regressor(X, M)  # (B,)
            preds.append(yhat.detach().cpu().numpy().astype(np.float32))

        return np.concatenate(preds, axis=0)

    def predict_hours(self, input_seqs: Union[str, List[str]], batch_size: int = 16) -> np.ndarray:
        """Return predictions after undoing the log transform, when one was used.

        If ``self.target_col == "log_label"`` the raw outputs are mapped
        through ``np.expm1``; otherwise they are returned as they are.
        The result is always float32.
        """
        raw = self.predict_raw(input_seqs, batch_size=batch_size)
        if self.target_col == "log_label":
            return np.expm1(raw).astype(np.float32)
        return raw.astype(np.float32)

    def __call__(self, input_seqs: Union[str, List[str]], batch_size: int = 16) -> np.ndarray:
        """Shorthand for ``predict_hours``."""
        return self.predict_hours(input_seqs, batch_size=batch_size)
def unittest():
    """Smoke test: load the checkpoint at a relative path and print one prediction."""
    predictor = HalflifeTransformer(ckpt_path="../classifier_ckpt/wt_halflife.pt")
    example_seq = "MWQRPSSWIEGRFPHSDAVFTDQYTRLRKQLAAKKYLQSLKQKRY"
    # The predictor takes a list of sequences and returns one value per entry.
    print("pred_hours:", predictor([example_seq]))
# Script entry point: run the smoke test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    unittest()