diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bbf59af..b109223 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
         files: '^.*\.py'
         types: [file]
   - repo: https://github.com/pycqa/flake8
-    rev: 5.0.4
+    rev: 7.3.0
     hooks:
       - id: flake8
         files: '^.*\.py'
diff --git a/README.md b/README.md
index 99721c7..a051fd8 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@
   - [Sequential PCD](#sequential-pcd)
   - [DICOM](#dicom)
   - [Robotics](#robotics)
+    - [Import LeRobot Dataset](#import-lerobot-dataset)
   - [Common](#common)
 - [Appendix](#appendix)
 - [Annotation](#annotation)
@@ -2207,6 +2208,37 @@ history = client.import_robotics_contents_file(
 )
 ```
 
+#### Import LeRobot Dataset
+
+Import a [LeRobot](https://github.com/huggingface/lerobot) dataset (v3) into a FastLabel robotics project.
+
+Requires additional dependencies:
+
+```bash
+pip install fastlabel[robotics]
+```
+
+For each episode, this method creates a robotics task and uploads the video files and frame data (converted from parquet to JSON).
+
+```python
+results = client.import_lerobot(
+    project="YOUR_PROJECT_SLUG",
+    lerobot_data_path="/path/to/lerobot/dataset",
+)
+```
+
+You can also specify which episodes to import:
+
+```python
+results = client.import_lerobot(
+    project="YOUR_PROJECT_SLUG",
+    lerobot_data_path="/path/to/lerobot/dataset",
+    episode_indices=[0, 1, 2],
+)
+```
+
+> **Note:** Only LeRobot dataset v3 is supported. v2 datasets need to be converted to v3 before importing.
+
 ### Common
 
 APIs for update and delete and count are same over all tasks.
diff --git a/examples/import_lerobot.py b/examples/import_lerobot.py
new file mode 100644
index 0000000..387a186
--- /dev/null
+++ b/examples/import_lerobot.py
@@ -0,0 +1,25 @@
+"""
+Import a LeRobot dataset into a FastLabel robotics project.
+
+Requires: pip install fastlabel[robotics]
+
+Supports LeRobot v3 dataset format only.
+  v3: data/chunk-*/file-*.parquet, videos/.../chunk-*/file-*.mp4
+"""
+
+from fastlabel import Client
+
+client = Client()
+
+# Import all episodes
+results = client.import_lerobot(
+    project="your-project-slug",
+    lerobot_data_path="/path/to/lerobot/dataset",
+)
+
+# Import specific episodes by index
+results = client.import_lerobot(
+    project="your-project-slug",
+    lerobot_data_path="/path/to/lerobot/dataset",
+    episode_indices=[0, 1, 2],
+)
diff --git a/fastlabel/__init__.py b/fastlabel/__init__.py
index 1394fb7..56d4030 100644
--- a/fastlabel/__init__.py
+++ b/fastlabel/__init__.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import re
+import shutil
 import urllib.parse
 from concurrent.futures import ThreadPoolExecutor, wait
 from pathlib import Path
@@ -14,7 +15,7 @@
 import xmltodict
 from PIL import Image, ImageColor, ImageDraw
 
-from fastlabel import const, converters, utils
+from fastlabel import const, converters, lerobot, utils
 from fastlabel.const import (
     EXPORT_IMAGE_WITH_ANNOTATIONS_SUPPORTED_IMAGE_TYPES,
     KEYPOINT_MIN_STROKE_WIDTH,
@@ -28,7 +29,7 @@
 )
 
 from .api import Api
-from .exceptions import FastLabelInvalidException
+from .exceptions import FastLabelException, FastLabelInvalidException
 from .query import DatasetObjectGetQuery
 
 logger = logging.getLogger(__name__)
@@ -2072,6 +2073,69 @@ def create_robotics_task(
 
         return self.api.post_request(endpoint, payload=payload)
 
+    def import_lerobot(
+        self,
+        project: str,
+        lerobot_data_path: str,
+        episode_indices: list = None,
+    ) -> list:
+        """
+        Import a LeRobot dataset into a FastLabel robotics project.
+
+        Only LeRobot dataset v3 is supported; v2 datasets are rejected.
+        For each episode, creates a robotics task and uploads the video files
+        and frame data (converted from parquet to JSON).
+
+        Requires: pip install fastlabel[robotics]
+
+        project is slug of your project (Required).
+        lerobot_data_path is the path to the LeRobot dataset directory (Required).
+        episode_indices is a list of episode indices to import.
+        If None, all episodes are imported (Optional).
+
+        Returns one dict per episode with the keys "episode", "success" and
+        "result". An episode that fails with a FastLabelException is reported
+        with success=False and does not stop the remaining episodes.
+        """
+        data_path = Path(lerobot_data_path)
+        episode_map = lerobot.build_episode_map(data_path)
+        if episode_indices is None:
+            episode_indices = sorted(episode_map.keys())
+
+        results = []
+        for episode_index in episode_indices:
+            episode_name = lerobot.format_episode_name(episode_index)
+            zip_path = None
+            try:
+                # Build the ZIP first so that an episode which cannot be
+                # packaged never leaves an empty task behind.
+                zip_path = lerobot.create_episode_zip(
+                    lerobot_data_path=data_path,
+                    episode_index=episode_index,
+                    episode_map=episode_map,
+                )
+                self.create_robotics_task(project=project, name=episode_name)
+                result = self.import_robotics_contents_file(
+                    project=project, file_path=zip_path
+                )
+                results.append(
+                    {"episode": episode_name, "success": True, "result": result}
+                )
+            except FastLabelException as e:
+                results.append(
+                    {
+                        "episode": episode_name,
+                        "success": False,
+                        "result": {"error": str(e)},
+                    }
+                )
+            finally:
+                # The ZIP is the only content of its temporary directory.
+                if zip_path is not None:
+                    shutil.rmtree(Path(zip_path).parent, ignore_errors=True)
+
+        return results
+
     def import_appendix_file(
         self,
         project: str,
diff --git a/fastlabel/lerobot/__init__.py b/fastlabel/lerobot/__init__.py
new file mode 100644
index 0000000..9654c44
--- /dev/null
+++ b/fastlabel/lerobot/__init__.py
@@ -0,0 +1,59 @@
+from fastlabel.exceptions import FastLabelInvalidException
+from fastlabel.lerobot import v3
+from fastlabel.lerobot.common import (
+    check_dependencies,
+    detect_version,
+    format_episode_name,
+    get_camera_dirs,
+)
+
+__all__ = [
+    "build_episode_map",
+    "get_episode_indices",
+    "create_episode_zip",
+    "format_episode_name",
+    "get_camera_dirs",
+]
+
+
+def _require_v3(lerobot_data_path):
+    """Check optional dependencies and reject anything but a v3 dataset.
+
+    Raises FastLabelInvalidException when pandas/pyarrow are missing, when the
+    dataset version cannot be detected, or when the dataset is v2.
+    """
+    check_dependencies()
+    if detect_version(lerobot_data_path) == "v2":
+        raise FastLabelInvalidException(
+            "LeRobot dataset v2 is not supported. Please convert to v3.",
+            422,
+        )
+
+
+def get_episode_indices(lerobot_data_path):
+    """Get all episode indices from a LeRobot v3 dataset."""
+    _require_v3(lerobot_data_path)
+    return v3.get_episode_indices(lerobot_data_path)
+
+
+def build_episode_map(lerobot_data_path):
+    """Build episode map from dataset. Returns a dict keyed by episode index."""
+    _require_v3(lerobot_data_path)
+    return v3._build_episode_map(lerobot_data_path)
+
+
+def create_episode_zip(lerobot_data_path, episode_index, episode_map=None):
+    """Create a ZIP file for a single episode in the format expected by FastLabel.
+
+    Supports LeRobot dataset v3 only.
+
+    ZIP structure (files at root, ZIP name = episode name):
+        {content_name}.mp4  (one per camera)
+        {episode_name}.json (frame data)
+
+    Returns the path to the created ZIP file.
+    The ZIP is the only entry of a dedicated temporary directory; the caller
+    is responsible for removing that directory once the ZIP has been used.
+    """
+    _require_v3(lerobot_data_path)
+    return v3.create_episode_zip(lerobot_data_path, episode_index, episode_map)
diff --git a/fastlabel/lerobot/common.py b/fastlabel/lerobot/common.py
new file mode 100644
index 0000000..d034a60
--- /dev/null
+++ b/fastlabel/lerobot/common.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+
+from fastlabel.exceptions import FastLabelInvalidException
+
+
+def check_dependencies():
+    """Raise FastLabelInvalidException if pandas or pyarrow cannot be imported."""
+    try:
+        import pandas  # noqa: F401
+        import pyarrow  # noqa: F401
+    except ImportError as e:
+        raise FastLabelInvalidException(
+            "pandas and pyarrow are required for LeRobot support. "
+            "Install them with: pip install fastlabel[robotics]",
+            422,
+        ) from e
+
+
+def detect_version(lerobot_data_path: Path) -> str:
+    """Detect LeRobot dataset version (v2 or v3).
+
+    Both versions use data/chunk-XXX/ directories.
+    v2: data/chunk-XXX/episode_YYYYYY.parquet
+    v3: data/chunk-XXX/file-YYY.parquet
+    """
+    data_dir = lerobot_data_path / "data"
+    if not data_dir.exists():
+        raise FastLabelInvalidException(f"Data directory not found: {data_dir}", 422)
+
+    for chunk_dir in data_dir.iterdir():
+        if not chunk_dir.is_dir() or not chunk_dir.name.startswith("chunk-"):
+            continue
+        for f in chunk_dir.iterdir():
+            if f.suffix != ".parquet":
+                continue
+            if f.stem.startswith("episode_"):
+                return "v2"
+            if f.stem.startswith("file-"):
+                return "v3"
+
+    raise FastLabelInvalidException(
+        "Could not detect LeRobot dataset version. "
+        "Expected data/chunk-XXX/episode_*.parquet (v2) "
+        "or data/chunk-XXX/file-*.parquet (v3).",
+        422,
+    )
+
+
+def format_episode_name(episode_index: int) -> str:
+    """Return the task name for an episode index, e.g. 3 -> episode_000003."""
+    return f"episode_{episode_index:06d}"
+
+
+def get_camera_dirs(lerobot_data_path: Path) -> list:
+    """Get camera directories and their content names.
+    Returns [(camera_dir, content_name), ...].
+    e.g. observation.images.top -> content_name = "images_top"
+    """
+    videos_dir = lerobot_data_path / "videos"
+    if not videos_dir.exists():
+        return []
+
+    results = []
+    for obs_dir in sorted(videos_dir.iterdir()):
+        if not obs_dir.is_dir():
+            continue
+        parts = obs_dir.name.split(".")
+        if parts[0] != "observation":
+            # Pass the error code like every other raise in this package.
+            raise FastLabelInvalidException(
+                f"Unexpected camera dir name: {obs_dir.name}", 422
+            )
+
+        content_name = "_".join(parts[1:])
+        results.append((obs_dir, content_name))
+    return results
diff --git a/fastlabel/lerobot/v3.py b/fastlabel/lerobot/v3.py
new file mode 100644
index 0000000..9a51801
--- /dev/null
+++ b/fastlabel/lerobot/v3.py
@@ -0,0 +1,195 @@
+import json
+import shutil
+import tempfile
+from pathlib import Path
+
+import cv2
+
+from fastlabel.exceptions import FastLabelInvalidException
+from fastlabel.lerobot.common import format_episode_name, get_camera_dirs
+
+
+def _build_episode_map(lerobot_data_path: Path) -> dict:
+    """Build a mapping of episode_index -> {chunk, file_stem, frame_offset, length}.
+
+    Reads all data parquet files across all chunks and computes per-episode
+    frame offsets within each file (needed for video segment extraction).
+
+    v3 layout: data/chunk-XXX/file-YYY.parquet
+
+    NOTE(review): offsets are accumulated in ascending episode_index order and
+    later applied to the video with the same chunk/file name. This assumes the
+    rows are stored in that order and that data and video files are split
+    identically - TODO confirm against the LeRobot v3 format specification.
+    """
+    import pandas as pd
+
+    data_dir = lerobot_data_path / "data"
+    episode_map = {}
+
+    for chunk_dir in sorted(data_dir.iterdir()):
+        if not chunk_dir.is_dir() or not chunk_dir.name.startswith("chunk-"):
+            continue
+        chunk_name = chunk_dir.name
+
+        for parquet_file in sorted(chunk_dir.glob("file-*.parquet")):
+            file_stem = parquet_file.stem
+            # Only the episode_index column is needed to size the episodes.
+            df = pd.read_parquet(parquet_file, columns=["episode_index"])
+
+            # One pass over the column: rows per episode, ascending by index.
+            counts = df["episode_index"].value_counts().sort_index()
+            frame_offset = 0
+            for ep_idx, count in counts.items():
+                length = int(count)
+                episode_map[int(ep_idx)] = {
+                    "chunk": chunk_name,
+                    "file_stem": file_stem,
+                    "frame_offset": frame_offset,
+                    "length": length,
+                }
+                frame_offset += length
+
+    return episode_map
+
+
+def get_episode_indices(lerobot_data_path: Path) -> list:
+    """Get all episode indices from a v3 dataset."""
+    episode_map = _build_episode_map(lerobot_data_path)
+    return sorted(episode_map.keys())
+
+
+def _convert_episode_frames(
+    lerobot_data_path: Path, episode_index: int, chunk: str, file_stem: str
+) -> list:
+    """Extract frame dicts for a single episode from a v3 consolidated parquet.
+
+    Returns an empty list when any of the required columns is missing.
+    """
+    import pandas as pd
+
+    parquet_path = lerobot_data_path / "data" / chunk / f"{file_stem}.parquet"
+    df = pd.read_parquet(parquet_path)
+    ep_df = df[df["episode_index"] == episode_index]
+
+    required_keys = ["observation.state", "action", "frame_index", "timestamp"]
+    if not all(key in ep_df.columns for key in required_keys):
+        return []
+
+    return [
+        {
+            "observation.state": row["observation.state"].tolist(),
+            "action": row["action"].tolist(),
+            "frame_index": int(row["frame_index"]),
+            "timestamp": float(row["timestamp"]),
+        }
+        for _, row in ep_df.iterrows()
+    ]
+
+
+def _extract_video_segment(
+    video_path: Path, start_frame: int, num_frames: int, output_path: Path
+) -> None:
+    """Extract a segment of frames from a video file using OpenCV.
+
+    Copies up to num_frames frames starting at start_frame into output_path,
+    keeping the source fps and frame size. Stops early if the source runs out
+    of frames.
+
+    Raises FastLabelInvalidException if the source cannot be opened or the
+    output file cannot be created.
+    """
+    cap = cv2.VideoCapture(str(video_path))
+    writer = None
+    try:
+        if not cap.isOpened():
+            raise FastLabelInvalidException(
+                f"Could not open video file: {video_path}", 422
+            )
+
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+
+        writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
+        if not writer.isOpened():
+            raise FastLabelInvalidException(
+                f"Could not create video file: {output_path}", 422
+            )
+        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
+
+        for _ in range(num_frames):
+            ret, frame = cap.read()
+            if not ret:
+                break
+            writer.write(frame)
+    finally:
+        # Release both handles on every path, including early failures.
+        if writer is not None:
+            writer.release()
+        cap.release()
+
+
+def create_episode_zip(
+    lerobot_data_path: Path, episode_index: int, episode_map: dict = None
+) -> str:
+    """Create a ZIP for a single v3 episode.
+
+    v3 video layout: videos/{key}/chunk-XXX/file-YYY.mp4
+
+    Returns the path of the ZIP, which is the only entry of a fresh temporary
+    directory. That directory is removed here if building the ZIP fails; on
+    success the caller is responsible for removing it.
+
+    Raises FastLabelInvalidException if episode_index is not in episode_map.
+    """
+    if episode_map is None:
+        episode_map = _build_episode_map(lerobot_data_path)
+
+    if episode_index not in episode_map:
+        raise FastLabelInvalidException(
+            f"Episode index {episode_index} not found in dataset.",
+            422,
+        )
+
+    ep_info = episode_map[episode_index]
+    chunk = ep_info["chunk"]
+    file_stem = ep_info["file_stem"]
+    frame_offset = ep_info["frame_offset"]
+    length = ep_info["length"]
+    episode_name = format_episode_name(episode_index)
+
+    tmp_dir = Path(tempfile.mkdtemp())
+    try:
+        content_dir = tmp_dir / "content"
+        content_dir.mkdir()
+
+        # Extract video segments
+        # v3: videos/{key}/chunk-XXX/file-YYY.mp4
+        for camera_dir, content_name in get_camera_dirs(lerobot_data_path):
+            video_path = camera_dir / chunk / f"{file_stem}.mp4"
+            if not video_path.exists():
+                continue
+            output_path = content_dir / f"{content_name}.mp4"
+            _extract_video_segment(video_path, frame_offset, length, output_path)
+
+        # Convert parquet to JSON
+        frames = _convert_episode_frames(
+            lerobot_data_path, episode_index, chunk, file_stem
+        )
+        json_path = content_dir / f"{episode_name}.json"
+        json_path.write_text(json.dumps(frames, ensure_ascii=False), encoding="utf-8")
+
+        # Create ZIP (files at root, ZIP name = episode name)
+        zip_path = shutil.make_archive(
+            base_name=str(tmp_dir / episode_name),
+            format="zip",
+            root_dir=str(content_dir),
+        )
+        shutil.rmtree(content_dir)
+    except BaseException:
+        # Do not leak the temporary directory when anything above fails.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        raise
+    return zip_path
diff --git a/pyproject.toml b/pyproject.toml
index d999244..f5420d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,9 @@ dependencies = [
 
 dynamic = ["version"]
 
+[project.optional-dependencies]
+robotics = ["pandas>=2.0.0", "pyarrow>=14.0.0"]
+
 [tool.setuptools]
 include-package-data = true
 
diff --git a/tox.ini b/tox.ini
index 7b5a44d..2c15b04 100644
--- a/tox.ini
+++ b/tox.ini
@@ -6,5 +6,5 @@ multi_line_output = 3
 
 [flake8]
 max-line-length = 88
-extend-ignore = E203,E231,E501
+extend-ignore = E203,E231,E501
 exclude = .git,__pycache__,.tox,.eggs,*.egg,build,dist,venv