Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# CI workflow: run the full pytest suite (unit, integration, e2e) on every
# pull request, on pushes to main, and on manual dispatch.
name: Tests

on:
  pull_request:
  push:
    branches: [main]
  workflow_dispatch:

jobs:
  test-suite:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Pin the interpreter to the version the project requires
      # (pyproject.toml declares requires-python = ">=3.12").
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install deps
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Markers come from [tool.pytest.ini_options] in pyproject.toml;
      # coverage.xml is produced there via the --cov-report=xml addopt.
      - name: Run unit, integration, and e2e tests
        run: pytest -m "unit or integration or e2e" --junitxml=pytest-all.xml

      # `if: always()` ensures the JUnit and coverage reports are uploaded
      # even when the test step fails — failing runs are precisely the ones
      # whose reports we need to inspect.
      - name: Upload test + coverage artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-suite-reports
          path: |
            pytest-all.xml
            coverage.xml
26 changes: 24 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
# Python cache and local environments
__pycache__/
.ipynb_checkpoints
.ipynb_checkpoints/
.env
.envrc
.venv
env/
venv/
ENV/

# Editor settings
.vscode/

# Local build and smoke-test output
smoketest/
.build_pyz/
build/
dist/
pip-wheel-metadata/

# Test runner and coverage artifacts
.pytest_cache/
.pytest_tmp*/
.coverage
.coverage.*
coverage.xml
htmlcov/
pytest-*.xml

# Project-generated local data/artifacts
*.egg-info/
.eggs/
*.fasta
*.csv
*.xlsx
Expand All @@ -18,4 +39,5 @@ smoketest/
*.txt
*.pyz
*.png
*.metadata
*.metadata
*.json
65 changes: 65 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Packaging, test-runner, and coverage configuration for pepseqpred.

[build-system]
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "pepseqpred"
version = "1.0.0rc1"
description = "Residue-level epitope prediction pipeline for peptide/protein workflows."
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "numpy>=2.3,<3",
    "pandas>=2.3,<3",
    "torch>=2.4,<3",
    "fair-esm==2.0.0",
    "scikit-learn>=1.5,<2",
    "optuna>=3.5,<5"
]

# Development-only tooling; install with `pip install .[dev]`.
[project.optional-dependencies]
dev = [
    "pytest>=8.0",
    "pytest-cov>=5.0",
    "pytest-mock>=3.14",
    "ruff>=0.6"
]

[tool.pytest.ini_options]
minversion = "8.0"
# Coverage is collected on every pytest run; the suite fails below 75%.
addopts = "-ra --strict-markers --cov=pepseqpred --cov-report=term-missing --cov-report=xml --cov-fail-under=75"
testpaths = ["tests"]
# src-layout: make the package importable in tests without installing it.
pythonpath = ["src"]
# --strict-markers (above) requires every marker used in tests to be listed here.
markers = [
    "unit: fast isolated tests",
    "integration: component interaction tests",
    "e2e: end-to-end pipeline tests",
    "slow: longer-running tests"
]

[tool.coverage.run]
branch = true
source = ["pepseqpred"]
omit = [
    "tests/*"
]

[tool.coverage.report]
show_missing = true
skip_empty = true
precision = 2

# Console entry points, one per CLI app under src/pepseqpred/apps/.
[project.scripts]
pepseqpred-esm = "pepseqpred.apps.esm_cli:main"
pepseqpred-labels = "pepseqpred.apps.labels_cli:main"
pepseqpred-predict = "pepseqpred.apps.prediction_cli:main"
pepseqpred-preprocess = "pepseqpred.apps.preprocess_cli:main"
pepseqpred-train-ffnn = "pepseqpred.apps.train_ffnn_cli:main"
pepseqpred-train-ffnn-optuna = "pepseqpred.apps.train_ffnn_optuna_cli:main"

[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]
include = ["pepseqpred*"]
Binary file modified requirements.txt
Binary file not shown.
3 changes: 2 additions & 1 deletion src/pepseqpred/apps/prediction_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ def main() -> None:
layer = esm_model.num_layers

# load model from disk
checkpoint = torch.load(args.checkpoint, map_location="cpu")
checkpoint = torch.load(
args.checkpoint, map_location="cpu", weights_only=True)
cli_model_cfg = _build_cli_model_config(args)
psp_model, model_cfg, model_cfg_src = build_model_from_checkpoint(
checkpoint,
Expand Down
2 changes: 1 addition & 1 deletion src/pepseqpred/apps/preprocess_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def main() -> None:
help="Prefix for subject column labels in z-score reactivity data.")
parser.add_argument("--save",
action="store_true",
dest="save_path",
dest="save",
default=False,
help="Store results in a .tsv output file to be used in model training.")

Expand Down
2 changes: 1 addition & 1 deletion src/pepseqpred/core/labels/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def _find_pt_path(self, protein_id: str) -> Path:
def _load_embedding_length(self, protein_id: str) -> int:
"""Finds .pt path, loads embedding as tensor, and returns the length (number of amino acids)."""
pt_path = self._find_pt_path(protein_id)
embedding = torch.load(pt_path, map_location="cpu")
embedding = torch.load(pt_path, map_location="cpu", weights_only=True)
if not isinstance(embedding, torch.Tensor) or embedding.dim() != 2:
raise ValueError(
f"Expected 2D tensor embedding for '{protein_id}', got {type(embedding)}")
Expand Down
43 changes: 35 additions & 8 deletions src/pepseqpred/core/train/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from labels, predictions, and probabilities.
"""

from typing import Dict, Any
from typing import Dict, Any, Union, Sequence
import numpy as np
import torch
from sklearn.metrics import (precision_recall_fscore_support,
average_precision_score,
Expand All @@ -16,7 +17,16 @@
auc)


def compute_eval_metrics(y_true: torch.Tensor, y_pred: torch.Tensor, y_prob: torch.Tensor) -> Dict[str, Any]:
ArrayLike1D = Union[torch.Tensor, np.ndarray, Sequence[float], Sequence[int]]


def _to_numpy_1d(x: ArrayLike1D) -> np.ndarray:
if isinstance(x, torch.Tensor):
return x.detach().cpu().numpy().reshape(-1)
return np.asarray(x).reshape(-1)


def compute_eval_metrics(y_true: ArrayLike1D, y_pred: ArrayLike1D, y_prob: ArrayLike1D) -> Dict[str, Any]:
"""
Computes evaluation metrics given true labels, predicted labels, and predicted probabilities.

Expand All @@ -36,31 +46,48 @@ def compute_eval_metrics(y_true: torch.Tensor, y_pred: torch.Tensor, y_prob: tor
"""
metrics: Dict[str, Any] = {}

# calculate precesion, recall, f1, and mcc
y_true_np = _to_numpy_1d(y_true).astype(np.int64, copy=False)
y_pred_np = _to_numpy_1d(y_pred).astype(np.int64, copy=False)
y_prob_np = _to_numpy_1d(y_prob).astype(np.float64, copy=False)

# calculate precision, recall, and f1
precision, recall, f1, _ = precision_recall_fscore_support(
y_true, y_pred, average="binary", zero_division=0)
y_true_np, y_pred_np, average="binary", zero_division=0)
metrics["precision"] = float(precision)
metrics["recall"] = float(recall)
metrics["f1"] = float(f1)
metrics["mcc"] = matthews_corrcoef(y_true, y_pred)

# Avoid sklearn warning when both tensors contain only one shared label.
if np.unique(np.concatenate((y_true_np, y_pred_np))).size < 2:
metrics["mcc"] = 0.0
else:
metrics["mcc"] = float(matthews_corrcoef(y_true_np, y_pred_np))

has_both_classes = np.unique(y_true_np).size >= 2
if not has_both_classes:
only_class = int(y_true_np[0]) if y_true_np.size > 0 else 0
metrics["auc"] = float("nan")
metrics["pr_auc"] = 1.0 if only_class == 1 else 0.0
metrics["auc10"] = float("nan")
return metrics

# ROC AUC
try:
metrics["auc"] = float(roc_auc_score(y_true, y_prob))
metrics["auc"] = float(roc_auc_score(y_true_np, y_prob_np))

except Exception:
metrics["auc"] = float("nan")

# PR AUC
try:
metrics["pr_auc"] = float(average_precision_score(y_true, y_prob))
metrics["pr_auc"] = float(average_precision_score(y_true_np, y_prob_np))

except Exception:
metrics["pr_auc"] = float("nan")

# AUC10 calculation
try:
fpr, tpr, _ = roc_curve(y_true, y_prob)
fpr, tpr, _ = roc_curve(y_true_np, y_prob_np)
mask = fpr <= 0.10
if mask.sum() >= 2:
metrics["auc10"] = float(auc(fpr[mask], tpr[mask]) / 0.10)
Expand Down
12 changes: 11 additions & 1 deletion src/pepseqpred/core/train/weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,5 +100,15 @@ def pos_weight_from_label_shards(label_shards: List[Path]) -> float:
f"{shard} missing class_stats (rebuild labels with --calc-pos-weight)"
)
total_pos += int(stats["pos_count"])
total_neg += int(stats["neg_counts"])
# Prefer the canonical key written by labels.builder, but support
# legacy/pluralized payloads for backwards compatibility.
if "neg_count" in stats:
total_neg += int(stats["neg_count"])
elif "neg_counts" in stats:
total_neg += int(stats["neg_counts"])
else:
raise ValueError(
f"{shard} class_stats missing negative count key "
"(expected 'neg_count' or 'neg_counts')"
)
return float(total_neg / max(1, total_pos))
36 changes: 36 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pathlib import Path
import pytest
import torch


@pytest.fixture
def training_artifacts(tmp_path: Path):
    """Build a minimal on-disk embedding dir and label shard for training tests.

    Writes four fake per-protein embeddings (6 residues x 4 dims) named
    ``<protein_id>-<family>.pt`` plus one label shard whose payload carries
    per-protein residue labels and aggregated ``class_stats`` counts.

    Returns:
        dict with ``embedding_dir`` (Path to the .pt embeddings) and
        ``label_shard`` (Path to the saved label payload).
    """
    embedding_dir = tmp_path / "emb"
    embedding_dir.mkdir(parents=True, exist_ok=True)
    shard_path = tmp_path / "labels_000.pt"

    proteins = [("P001", "111"), ("P002", "111"),
                ("P003", "222"), ("P004", "222")]
    label_map = {}
    pos_total = 0
    neg_total = 0

    for protein_id, family in proteins:
        # Fake embedding: 6 residues, 4 feature dims.
        embedding = torch.randn(6, 4, dtype=torch.float32)
        torch.save(embedding, embedding_dir / f"{protein_id}-{family}.pt")

        residue_labels = torch.tensor([1, 0, 0, 1, 0, 0], dtype=torch.float32)
        label_map[protein_id] = residue_labels
        pos_total += int((residue_labels == 1).sum().item())
        neg_total += int((residue_labels == 0).sum().item())

    torch.save(
        {
            "labels": label_map,
            "class_stats": {
                "pos_count": pos_total,
                "neg_count": neg_total
            }
        },
        shard_path,
    )

    return {"embedding_dir": embedding_dir, "label_shard": shard_path}
Loading
Loading