Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions skills/detection/yolo-detection-2026/requirements_mps.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# YOLO 2026 — MPS (Apple Silicon) requirements
# Standard PyTorch — MPS backend is included by default on macOS
torch>=2.4.0
torchvision>=0.19.0
ultralytics>=8.3.0
coremltools>=8.0
# Uses ONNX Runtime + CoreML EP for GPU/ANE acceleration.
# Pre-built yolo26n.onnx is shipped in the repo, so torch/ultralytics
# are NOT needed at runtime — only onnxruntime for inference.
onnxruntime>=1.19.0
numpy>=1.24.0,<2.0.0
opencv-python-headless>=4.8.0
Pillow>=10.0.0

248 changes: 216 additions & 32 deletions skills/detection/yolo-detection-2026/scripts/env_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,12 @@ class BackendSpec:
),
"mps": BackendSpec(
name="mps",
export_format="coreml",
model_suffix=".mlpackage",
half=True,
extra_export_args={"nms": False},
compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM
export_format="onnx",
model_suffix=".onnx",
half=False, # ONNX Runtime handles precision internally
# ONNX Runtime + CoreMLExecutionProvider bypasses the broken
# MPSGraphExecutable MLIR pipeline on macOS 26.x while still
# leveraging GPU/ANE via CoreML under the hood.
),
"intel": BackendSpec(
name="intel",
Expand All @@ -78,6 +79,106 @@ class BackendSpec:
),
}

# ─── ONNX + CoreML EP wrapper ────────────────────────────────────────────────
# Provides an ultralytics-compatible model interface using onnxruntime directly
# with CoreMLExecutionProvider for ~6ms inference on Apple Silicon (vs 21ms when
# ultralytics defaults to CPUExecutionProvider).

class _BoxResult:
"""Minimal replacement for ultralytics Boxes result."""
__slots__ = ('xyxy', 'conf', 'cls')

def __init__(self, xyxy, conf, cls):
self.xyxy = xyxy # [[x1,y1,x2,y2]]
self.conf = conf # [conf]
self.cls = cls # [cls_id]


class _DetResult:
"""Minimal replacement for ultralytics Results."""
__slots__ = ('boxes',)

def __init__(self, boxes: list):
self.boxes = boxes


class _OnnxCoreMLModel:
"""ONNX Runtime model with CoreML EP, compatible with ultralytics API.

Supports: model(image_path_or_pil, conf=0.5, verbose=False)
Returns: list of _DetResult with .boxes iterable of _BoxResult
"""

def __init__(self, session, class_names: dict):
self.session = session
self.names = class_names
self._input_name = session.get_inputs()[0].name
# Expected input shape: [1, 3, H, W]
shape = session.get_inputs()[0].shape
self._input_h = shape[2] if isinstance(shape[2], int) else 640
self._input_w = shape[3] if isinstance(shape[3], int) else 640

def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs):
"""Run inference on an image path or PIL Image."""
import numpy as np
from PIL import Image

# Load image
if isinstance(source, str):
img = Image.open(source).convert("RGB")
elif isinstance(source, Image.Image):
img = source.convert("RGB")
else:
img = Image.fromarray(source).convert("RGB")

orig_w, orig_h = img.size

# Letterbox resize to input size
scale = min(self._input_w / orig_w, self._input_h / orig_h)
new_w, new_h = int(orig_w * scale), int(orig_h * scale)
img_resized = img.resize((new_w, new_h), Image.BILINEAR)

# Pad to input size (center)
pad_x = (self._input_w - new_w) // 2
pad_y = (self._input_h - new_h) // 2
canvas = np.full((self._input_h, self._input_w, 3), 114, dtype=np.uint8)
canvas[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = np.array(img_resized)

# HWC→CHW, normalize, add batch dim
blob = canvas.transpose(2, 0, 1).astype(np.float32) / 255.0
blob = np.expand_dims(blob, 0)

# Run inference
outputs = self.session.run(None, {self._input_name: blob})
preds = outputs[0] # shape: [1, num_detections, 6]

# Parse detections: [x1, y1, x2, y2, confidence, class_id]
boxes = []
for det in preds[0]:
det_conf = float(det[4])
if det_conf < conf:
continue

# Scale coordinates back to original image space
x1 = (float(det[0]) - pad_x) / scale
y1 = (float(det[1]) - pad_y) / scale
x2 = (float(det[2]) - pad_x) / scale
y2 = (float(det[3]) - pad_y) / scale

# Clip to image bounds
x1 = max(0, min(x1, orig_w))
y1 = max(0, min(y1, orig_h))
x2 = max(0, min(x2, orig_w))
y2 = max(0, min(y2, orig_h))

boxes.append(_BoxResult(
xyxy=np.array([[x1, y1, x2, y2]]),
conf=np.array([det_conf]),
cls=np.array([int(det[5])]),
))

return [_DetResult(boxes)]


# ─── Hardware detection ──────────────────────────────────────────────────────

Expand Down Expand Up @@ -133,31 +234,79 @@ def detect() -> "HardwareEnv":
return env

def _try_cuda(self) -> bool:
    """Detect NVIDIA GPU via nvidia-smi (with Windows path search) and WMI fallback.

    Returns:
        True when an NVIDIA GPU is found; as a side effect populates
        backend, device, gpu_name, gpu_memory_mb, driver_version and
        detection_details. False otherwise.
    """
    nvidia_smi = shutil.which("nvidia-smi")

    # Windows: check well-known install paths if not on PATH
    if not nvidia_smi and platform.system() == "Windows":
        for candidate in [
            Path(os.environ.get("PROGRAMFILES", r"C:\Program Files"))
            / "NVIDIA Corporation" / "NVSMI" / "nvidia-smi.exe",
            Path(os.environ.get("WINDIR", r"C:\Windows"))
            / "System32" / "nvidia-smi.exe",
        ]:
            if candidate.is_file():
                nvidia_smi = str(candidate)
                _log(f"Found nvidia-smi at {nvidia_smi}")
                break

    if nvidia_smi:
        try:
            result = subprocess.run(
                [nvidia_smi, "--query-gpu=name,memory.total,driver_version",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode == 0:
                # First line describes GPU 0: "name, memory_mb, driver"
                line = result.stdout.strip().split("\n")[0]
                parts = [p.strip() for p in line.split(",")]
                if len(parts) >= 3:
                    self.backend = "cuda"
                    self.device = "cuda"
                    self.gpu_name = parts[0]
                    self.gpu_memory_mb = int(float(parts[1]))
                    self.driver_version = parts[2]
                    self.detection_details["nvidia_smi"] = line
                    _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb}MB, driver {self.driver_version})")
                    return True
        except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
            _log(f"nvidia-smi probe failed: {e}")

    # Windows WMI fallback: detect NVIDIA GPU even without nvidia-smi on PATH
    if platform.system() == "Windows":
        return self._try_cuda_wmi()

    return False

def _try_cuda_wmi(self) -> bool:
    """Windows-only: detect NVIDIA GPU via WMI (wmic).

    Returns:
        True when a video controller whose name contains "NVIDIA" is
        found (populating backend/device/gpu fields as a side effect),
        False on any failure — wmic missing, non-zero exit, or no match.
    """
    try:
        result = subprocess.run(
            ["wmic", "path", "win32_VideoController", "get",
             "Name,AdapterRAM,DriverVersion", "/format:csv"],
            capture_output=True, text=True, timeout=10,
        )
        if result.returncode != 0:
            return False

        for line in result.stdout.strip().split("\n"):
            if "NVIDIA" in line.upper():
                parts = [p.strip() for p in line.split(",")]
                # CSV format: Node,AdapterRAM,DriverVersion,Name
                if len(parts) >= 4:
                    self.backend = "cuda"
                    self.device = "cuda"
                    self.gpu_name = parts[3]
                    try:
                        # AdapterRAM is reported in bytes; convert to MB.
                        self.gpu_memory_mb = int(int(parts[1]) / (1024 * 1024))
                    except (ValueError, IndexError):
                        # Leave memory unset on unparsable AdapterRAM.
                        pass
                    self.driver_version = parts[2] if len(parts) > 2 else ""
                    self.detection_details["wmi"] = line
                    _log(f"NVIDIA GPU (WMI): {self.gpu_name} ({self.gpu_memory_mb}MB)")
                    return True
    except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
        _log(f"WMI probe failed: {e}")
    return False

def _try_rocm(self) -> bool:
Expand Down Expand Up @@ -363,12 +512,28 @@ def _check_rocm_runtime(self):
_log("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm")
raise ImportError("ROCmExecutionProvider not available")

def _check_mps_runtime(self):
"""Verify onnxruntime has CoreML provider for Apple GPU/ANE acceleration.

ONNX Runtime + CoreMLExecutionProvider bypasses the broken
MPSGraphExecutable MLIR pipeline (macOS 26.x) while still routing
inference through CoreML to leverage GPU and Neural Engine.
"""
import onnxruntime
providers = onnxruntime.get_available_providers()
if "CoreMLExecutionProvider" in providers:
_log(f"onnxruntime CoreML provider available: {providers}")
return True
_log(f"onnxruntime providers: {providers} — CoreMLExecutionProvider not found")
_log("Fix: pip install onnxruntime (arm64 macOS wheel includes CoreML EP)")
raise ImportError("CoreMLExecutionProvider not available")

def _check_framework(self) -> bool:
"""Check if the optimized inference runtime is importable."""
"""Check if the optimized inference runtime is importable and compatible."""
checks = {
"cuda": lambda: __import__("tensorrt"),
"rocm": lambda: self._check_rocm_runtime(),
"mps": lambda: __import__("coremltools"),
"mps": lambda: self._check_mps_runtime(),
"intel": lambda: __import__("openvino"),
"cpu": lambda: __import__("onnxruntime"),
}
Expand Down Expand Up @@ -496,6 +661,27 @@ def __init__(self, *args, **kwargs):
_log("coremltools not available, loading without compute_units")
return YOLO(model_path)

def _load_onnx_coreml(self, onnx_path: str):
    """Load ONNX model with CoreMLExecutionProvider for fast GPU/ANE inference.

    Returns an _OnnxCoreMLModel wrapper that is compatible with the
    ultralytics model(frame_path, conf=...) call pattern.

    Args:
        onnx_path: path to the exported .onnx model. A sibling .pt file
            with the same stem must exist to supply class names.
    """
    import onnxruntime as ort

    providers = ['CoreMLExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(onnx_path, providers=providers)
    active = session.get_providers()
    _log(f"ONNX+CoreML session: {active}")

    # Get YOLO class names from the .pt model (needed for detection output).
    # Use with_suffix rather than str.replace('.onnx', '.pt'): replace()
    # would also rewrite an '.onnx' substring occurring earlier in the path.
    from ultralytics import YOLO
    pt_path = str(Path(onnx_path).with_suffix('.pt'))
    pt_model = YOLO(pt_path)
    class_names = pt_model.names  # {0: 'person', 1: 'bicycle', ...}

    return _OnnxCoreMLModel(session, class_names)

def load_optimized(self, model_name: str, use_optimized: bool = True):
"""
Load the best available model for this hardware.
Expand All @@ -512,10 +698,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
optimized_path = self.get_optimized_path(model_name)
if optimized_path.exists():
try:
# On Apple Silicon: route CoreML to Neural Engine
if self.backend == "mps" and self.compute_units != "all":
model = self._load_coreml_with_compute_units(
str(optimized_path))
# MPS: use ONNX Runtime + CoreML EP for fast inference
if self.backend == "mps":
model = self._load_onnx_coreml(str(optimized_path))
else:
model = YOLO(str(optimized_path))
self.load_ms = (time.perf_counter() - t0) * 1000
Expand All @@ -529,10 +714,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
exported = self.export_model(pt_model, model_name)
if exported:
try:
# On Apple Silicon: route CoreML to Neural Engine
if self.backend == "mps" and self.compute_units != "all":
model = self._load_coreml_with_compute_units(
str(exported))
# MPS: use ONNX Runtime + CoreML EP for fast inference
if self.backend == "mps":
model = self._load_onnx_coreml(str(exported))
else:
model = YOLO(str(exported))
self.load_ms = (time.perf_counter() - t0) * 1000
Expand Down
Binary file added skills/detection/yolo-detection-2026/yolo26n.onnx
Binary file not shown.
Loading
Loading