Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ if __name__ == '__main__':
import os
from tqdm import tqdm
from tinker import types
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.preprocessor import SelfCognitionProcessor
Expand Down
2 changes: 1 addition & 1 deletion README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ if __name__ == '__main__':
import os
from tqdm import tqdm
from tinker import types
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.preprocessor import SelfCognitionProcessor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,23 @@

# Step 1: Load environment variables from a .env file (e.g., API tokens)
import dotenv

dotenv.load_dotenv('.env')

import os

# Step 2: Initialize Tinker client before importing ServiceClient
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client

init_tinker_client()

# Step 3: Use ServiceClient directly from tinker
from tinker import ServiceClient

service_client = ServiceClient(
base_url='http://www.modelscope.cn/twinkle',
api_key=os.environ.get('MODELSCOPE_TOKEN')
# BASE_URL can be a local server endpoint such as http://localhost:8000, or
# points to a previously deployed remote server, or
# modelscope server such as 'http://www.modelscope.cn/twinkle'
base_url='http://localhost:8000',
# API_KEY can be empty or a meaningful value according to the server configuration
api_key='EMPTY-TOKEN'
)

# Step 4: List models available on the server to verify the connection
Expand All @@ -40,10 +41,12 @@

# You can resume from either:
# 1. A twinkle path: "twinkle://.../<run_id>/weights/<checkpoint_name>"
# 2. A model id on hub: "<user>/<model_id>"
# 2. A model id on ModelScope hub: "ms://<user>/<model_id>"
# 3. A local path to a checkpoint directory
# Example:
# resume_path = "twinkle://20260131_170251-Qwen_Qwen2_5-0_5B-Instruct-7275126c/weights/pig-latin-lora-epoch-1"
# resume_path = "AlexEz/20260205_163645-Qwen_Qwen2_5-7B-Instruct-385d5c17_pig-latin-lora-epoch-1"
# resume_path = "ms://AlexEz/20260205_163645-Qwen_Qwen2_5-7B-Instruct-385d5c17_pig-latin-lora-epoch-1"
# resume_path = "/path/to/local/checkpoint/directory"
resume_path = ''

print(f'Found {len(response.training_runs)} training runs')
Expand All @@ -58,7 +61,7 @@

# Step 6: Create or resume a training client.
# If resume_path is set, it restores both model weights and optimizer state.
base_model = 'Qwen/Qwen2.5-7B-Instruct'
base_model = 'Qwen/Qwen3-4B'
if not resume_path:
training_client = service_client.create_lora_training_client(base_model=base_model)
else:
Expand All @@ -85,19 +88,7 @@
{
'input': 'pickle jar',
'output': 'ickle-pay ar-jay'
},
{
'input': 'space exploration',
'output': 'ace-spay exploration-way'
},
{
'input': 'rubber duck',
'output': 'ubber-ray uck-day'
},
{
'input': 'coding wizard',
'output': 'oding-cay izard-way'
},
}
]

from modelscope import AutoTokenizer
Expand Down Expand Up @@ -181,6 +172,7 @@ def process_example(example: dict, tokenizer) -> types.Datum:

# Step 9: Publish the final checkpoint to ModelScope Hub.
# NOTE: Requires a valid ModelScope token set as api_key when initializing the client.
# The published model name will be: {run_id}_{checkpoint_name}
# The model will be published under the owner of the supplied ModelScope token,
# with model name formatted as: {run_id}_{checkpoint_name}
rest_client.publish_checkpoint_from_tinker_path(save_result.path).result()
print('Published checkpoint')
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ applications:
import_path: server # Python module to import
args:
server_config:
per_token_model_limit: 1 # Maximum number of models (adapters) per token (server-globally enforced)
per_token_model_limit: 3 # Maximum number of models (adapters) per token (server-globally enforced)
supported_models:
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen3-4B
deployments:
- name: TinkerCompatServer
autoscaling_config:
Expand All @@ -36,17 +36,17 @@ applications:

# 2. Model Service (commented out) - Would host the base model for training.
# Uncomment and configure if you need a training model worker.
- name: models-Qwen2.5-7B-Instruct
route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct
- name: models-Qwen3-4B
route_prefix: /api/v1/model/Qwen/Qwen3-4B
import_path: model
args:
use_megatron: true
model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier
max_length: 10240
nproc_per_node: 2 # Number of GPU processes per node
device_group:
name: model
ranks: [0,1] # GPU rank indices
ranks: 2 # Number of GPUs to use
device_type: cuda
device_mesh:
device_type: cuda
Expand All @@ -58,11 +58,12 @@ applications:
adapter_config:
adapter_timeout: 30 # Seconds before idle adapter unload
adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
max_loras: 1 # Maximum number of LoRA adapters per model
deployments:
- name: ModelManagement
autoscaling_config:
min_replicas: 1
max_replicas: 1
min_replicas: 2
max_replicas: 2
target_ongoing_requests: 16
ray_actor_options:
num_cpus: 0.1
Expand All @@ -72,36 +73,36 @@ applications:

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
# - name: sampler-Qwen2.5-7B-Instruct
# route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
# import_path: sampler
# args:
# model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
# nproc_per_node: 2 # Number of GPU processes per node
# sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
# engine_args: # vLLM engine-specific settings
# max_model_len: 4096 # Maximum sequence length the engine supports
# gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
# enable_lora: true # Allow loading LoRA adapters during inference
# logprobs_mode: processed_logprobs # Logprobs mode for sampling results
# device_group: # Logical device group for the sampler
# name: sampler
# ranks: [2] # GPU rank indices to use
# device_type: cuda
# device_mesh:
# device_type: cuda
# dp_size: 1
# queue_config:
# rps_limit: 100 # Max requests per second
# tps_limit: 100000 # Max tokens per second
# deployments:
# - name: SamplerManagement
# autoscaling_config:
# min_replicas: 1
# max_replicas: 1
# target_ongoing_requests: 16
# ray_actor_options:
# num_cpus: 0.1
# runtime_env:
# env_vars:
# TWINKLE_TRUST_REMOTE_CODE: "0"
- name: sampler-Qwen3-4B
route_prefix: /api/v1/sampler/Qwen/Qwen3-4B
import_path: sampler
args:
model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier
nproc_per_node: 2 # Number of GPU processes per node
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
engine_args: # vLLM engine-specific settings
max_model_len: 4096 # Maximum sequence length the engine supports
gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
enable_lora: true # Allow loading LoRA adapters during inference
logprobs_mode: processed_logprobs # Logprobs mode for sampling results
device_group: # Logical device group for the sampler
name: sampler
ranks: 1 # Number of GPUs to use
device_type: cuda
device_mesh:
device_type: cuda
dp_size: 1
queue_config:
rps_limit: 100 # Max requests per second
tps_limit: 100000 # Max tokens per second
deployments:
- name: SamplerManagement
autoscaling_config:
min_replicas: 1
max_replicas: 1
target_ongoing_requests: 16
ray_actor_options:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@

from twinkle.data_format import Message, Trajectory
from twinkle.template import Template
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client

# Step 1: Initialize Tinker client
init_tinker_client()

from tinker import ServiceClient

# Step 2: Define the base model and connect to the server
base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
base_model = 'Qwen/Qwen3-4B'
service_client = ServiceClient(
base_url='http://www.modelscope.cn/twinkle',
api_key=os.environ.get('MODELSCOPE_TOKEN')
base_url='http://localhost:8000',
api_key='EMPTY-TOKEN'
)

# Step 3: Create a sampling client by loading weights from a saved checkpoint.
Expand Down
137 changes: 137 additions & 0 deletions cookbook/client/tinker/custom_service/self_cognition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Tinker-Compatible Client - Self-Cognition Training & Evaluation Example
#
# This script demonstrates two workflows using the Tinker-compatible client:
# 1. train(): Fine-tune a model on a self-cognition dataset so it learns
# a custom identity (name, author).
# 2. eval(): Load a trained checkpoint and sample from it to verify
# that the model has learned the custom identity.
# The server must be running first (see server.py and server_config.yaml).
import os
from tqdm import tqdm
from tinker import types
from twinkle import init_tinker_client
from twinkle.data_format import Message, Trajectory
from twinkle.template import Template
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.preprocessor import SelfCognitionProcessor
from twinkle.server.tinker.common import input_feature_to_datum

# Initialize the Tinker client before importing ServiceClient
init_tinker_client()

from tinker import ServiceClient

# The base model to fine-tune / evaluate
base_model = 'Qwen/Qwen3-4B'
base_url = 'http://localhost:8000'
api_key = 'EMPTY_API_KEY'


def train():
    """Fine-tune the base model with a LoRA adapter on the self-cognition dataset.

    Loads 500 examples of ``ms://swift/self-cognition``, rewrites the identity
    placeholders, then runs 3 epochs of forward/backward + Adam steps against
    the Tinker-compatible server, saving a checkpoint after each epoch.
    Uses the module-level ``base_model``, ``base_url`` and ``api_key``.
    """
    # Step 1: Prepare the dataset.

    # Load the self-cognition dataset from ModelScope (first 500 examples).
    dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))

    # Apply the chat template matching the base model (max 256 tokens per sample).
    dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)

    # Replace placeholder names with the custom model/author identity.
    dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False)

    # Tokenize and encode the dataset into model-ready input features.
    dataset.encode(batched=True, load_from_cache_file=False)

    # Wrap the dataset into a DataLoader that yields batches of size 8.
    dataloader = DataLoader(dataset=dataset, batch_size=8)

    # Step 2: Initialize the training client.
    service_client = ServiceClient(
        base_url=base_url,
        api_key=api_key
    )

    # Create a LoRA training client for the base model (rank=16 for the LoRA adapter).
    training_client = service_client.create_lora_training_client(base_model=base_model, rank=16)

    # Step 3: Run the training loop.
    for epoch in range(3):
        print(f'Epoch {epoch}')
        # The step index was unused, so iterate the batches directly.
        for batch in tqdm(dataloader):
            # Convert each InputFeature into a Datum for the Tinker API.
            input_datum = [input_feature_to_datum(input_feature) for input_feature in batch]

            # Send data to server: forward + backward pass (computes gradients).
            fwdbwd_future = training_client.forward_backward(input_datum, 'cross_entropy')

            # Optimizer step: update model weights with Adam.
            optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4))

            # Wait for both operations to complete before the next batch.
            # fwdbwd_result.loss_fn_outputs carries per-token logprobs if you
            # want to compute a weighted average log-loss for monitoring.
            fwdbwd_result = fwdbwd_future.result()
            optim_result = optim_future.result()

            print(f'Training Metrics: {optim_result}')

        # Save a checkpoint after each epoch.
        save_future = training_client.save_state(f'twinkle-lora-{epoch}')
        save_result = save_future.result()
        print(f'Saved checkpoint to {save_result.path}')


def eval():  # noqa: A001 - shadows builtin eval; name kept for script compatibility
    """Load a trained LoRA checkpoint and sample from it to verify self-cognition.

    Builds a chat prompt asking the model who it is, samples 8 completions,
    and prints the decoded responses. Uses the module-level ``base_model``,
    ``base_url`` and ``api_key``.
    """
    # Step 1: Load the trained LoRA checkpoint for inference.

    # Path to a previously saved LoRA checkpoint (twinkle:// URI).
    # NOTE(review): this example path was produced from a Qwen2.5-7B-Instruct
    # run; replace it with a checkpoint trained on `base_model` (Qwen3-4B),
    # e.g. one saved by train() above.
    weight_path = 'twinkle://20260212_174205-Qwen_Qwen2_5-7B-Instruct-51edc9ed/weights/twinkle-lora-2'

    # Use the same module-level credentials as train() for consistency: the
    # previous env-var token (MODELSCOPE_TOKEN) does not match the local
    # server this script is configured against.
    service_client = ServiceClient(base_url=base_url, api_key=api_key)
    sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model)

    # Step 2: Prepare the chat prompt.

    # Build a multi-turn conversation to test the model's self-cognition.
    template = Template(model_id=f'ms://{base_model}')

    trajectory = Trajectory(
        messages=[
            Message(role='system', content='You are a helpful assistant'),
            Message(role='user', content='你是谁?'),
        ]
    )

    input_feature = template.encode(trajectory, add_generation_prompt=True)

    input_ids = input_feature['input_ids'].tolist()

    # Step 3: Generate responses.

    prompt = types.ModelInput.from_ints(input_ids)
    params = types.SamplingParams(
        max_tokens=50,    # Maximum tokens to generate
        temperature=0.2,  # Low temperature for more focused responses
        stop=['\n']       # Stop at newline
    )

    # Sample 8 independent completions.
    print('Sampling...')
    future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=8)
    result = future.result()

    # Decode and print each response.
    print('Responses:')
    for i, seq in enumerate(result.sequences):
        print(f'{i}: {repr(template.decode(seq.tokens))}')


if __name__ == '__main__':
    train()  # Run training (comment out to skip)
    # eval()  # Uncomment to run evaluation / inference
Loading
Loading