Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ if __name__ == '__main__':
import os
from tqdm import tqdm
from tinker import types
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.preprocessor import SelfCognitionProcessor
Expand Down
2 changes: 1 addition & 1 deletion README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ if __name__ == '__main__':
import os
from tqdm import tqdm
from tinker import types
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.preprocessor import SelfCognitionProcessor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,23 @@

# Step 1: Load environment variables from a .env file (e.g., API tokens)
import dotenv

dotenv.load_dotenv('.env')

import os

# Step 2: Initialize Tinker client before importing ServiceClient
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client

init_tinker_client()

# Step 3: Use ServiceClient directly from tinker
from tinker import ServiceClient

service_client = ServiceClient(
base_url='http://www.modelscope.cn/twinkle',
api_key=os.environ.get('MODELSCOPE_TOKEN')
# BASE_URL can be a local server endpoint such as http://localhost:8000, or
# points to a previously deployed remote server, or
# modelscope server such as 'http://www.modelscope.cn/twinkle'
base_url='http://localhost:8000',
# API_KEY can be empty or a meaningful value according to the server configuration
api_key='EMPTY-TOKEN'
)

# Step 4: List models available on the server to verify the connection
Expand All @@ -40,10 +41,12 @@

# You can resume from either:
# 1. A twinkle path: "twinkle://.../<run_id>/weights/<checkpoint_name>"
# 2. A model id on hub: "<user>/<model_id>"
# 2. A model id on ModelScope hub: "ms://<user>/<model_id>"
# 3. A local path to a checkpoint directory
# Example:
# resume_path = "twinkle://20260131_170251-Qwen_Qwen2_5-0_5B-Instruct-7275126c/weights/pig-latin-lora-epoch-1"
# resume_path = "AlexEz/20260205_163645-Qwen_Qwen2_5-7B-Instruct-385d5c17_pig-latin-lora-epoch-1"
# resume_path = "ms://AlexEz/20260205_163645-Qwen_Qwen2_5-7B-Instruct-385d5c17_pig-latin-lora-epoch-1"
# resume_path = "/path/to/local/checkpoint/directory"
resume_path = ''

print(f'Found {len(response.training_runs)} training runs')
Expand All @@ -58,7 +61,7 @@

# Step 6: Create or resume a training client.
# If resume_path is set, it restores both model weights and optimizer state.
base_model = 'Qwen/Qwen2.5-7B-Instruct'
base_model = 'Qwen/Qwen3-4B'
if not resume_path:
training_client = service_client.create_lora_training_client(base_model=base_model)
else:
Expand All @@ -85,19 +88,7 @@
{
'input': 'pickle jar',
'output': 'ickle-pay ar-jay'
},
{
'input': 'space exploration',
'output': 'ace-spay exploration-way'
},
{
'input': 'rubber duck',
'output': 'ubber-ray uck-day'
},
{
'input': 'coding wizard',
'output': 'oding-cay izard-way'
},
}
]

from modelscope import AutoTokenizer
Expand Down Expand Up @@ -181,6 +172,7 @@ def process_example(example: dict, tokenizer) -> types.Datum:

# Step 9: Publish the final checkpoint to ModelScope Hub.
# NOTE: Requires a valid ModelScope token set as api_key when initializing the client.
# The published model name will be: {run_id}_{checkpoint_name}
# The model will be published under the owner of the supplied ModelScope token,
# with model name formatted as: {run_id}_{checkpoint_name}
rest_client.publish_checkpoint_from_tinker_path(save_result.path).result()
print('Published checkpoint')
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ applications:
import_path: server # Python module to import
args:
server_config:
per_token_model_limit: 1 # Maximum number of models (adapters) per token (server-globally enforced)
per_token_model_limit: 3 # Maximum number of models (adapters) per token (server-globally enforced)
supported_models:
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen3-4B
deployments:
- name: TinkerCompatServer
autoscaling_config:
Expand All @@ -36,17 +36,17 @@ applications:

# 2. Model Service (commented out) - Would host the base model for training.
# Uncomment and configure if you need a training model worker.
- name: models-Qwen2.5-7B-Instruct
route_prefix: /api/v1/model/Qwen/Qwen2.5-7B-Instruct
- name: models-Qwen3-4B
route_prefix: /api/v1/model/Qwen/Qwen3-4B
import_path: model
args:
use_megatron: true
model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier
max_length: 10240
nproc_per_node: 2 # Number of GPU processes per node
device_group:
name: model
ranks: [0,1] # GPU rank indices
ranks: 2 # Number of GPUs to use
device_type: cuda
device_mesh:
device_type: cuda
Expand All @@ -58,11 +58,12 @@ applications:
adapter_config:
adapter_timeout: 30 # Seconds before idle adapter unload
adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
max_loras: 1 # Maximum number of LoRA adapters per model
deployments:
- name: ModelManagement
autoscaling_config:
min_replicas: 1
max_replicas: 1
min_replicas: 2
max_replicas: 2
target_ongoing_requests: 16
ray_actor_options:
num_cpus: 0.1
Expand All @@ -72,36 +73,36 @@ applications:

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
# - name: sampler-Qwen2.5-7B-Instruct
# route_prefix: /api/v1/sampler/Qwen/Qwen2.5-7B-Instruct
# import_path: sampler
# args:
# model_id: "ms://Qwen/Qwen2.5-7B-Instruct" # ModelScope model identifier
# nproc_per_node: 2 # Number of GPU processes per node
# sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
# engine_args: # vLLM engine-specific settings
# max_model_len: 4096 # Maximum sequence length the engine supports
# gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
# enable_lora: true # Allow loading LoRA adapters during inference
# logprobs_mode: processed_logprobs # Logprobs mode for sampling results
# device_group: # Logical device group for the sampler
# name: sampler
# ranks: [2] # GPU rank indices to use
# device_type: cuda
# device_mesh:
# device_type: cuda
# dp_size: 1
# queue_config:
# rps_limit: 100 # Max requests per second
# tps_limit: 100000 # Max tokens per second
# deployments:
# - name: SamplerManagement
# autoscaling_config:
# min_replicas: 1
# max_replicas: 1
# target_ongoing_requests: 16
# ray_actor_options:
# num_cpus: 0.1
# runtime_env:
# env_vars:
# TWINKLE_TRUST_REMOTE_CODE: "0"
- name: sampler-Qwen3-4B
route_prefix: /api/v1/sampler/Qwen/Qwen3-4B
import_path: sampler
args:
model_id: "ms://Qwen/Qwen3-4B" # ModelScope model identifier
nproc_per_node: 2 # Number of GPU processes per node
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
engine_args: # vLLM engine-specific settings
max_model_len: 4096 # Maximum sequence length the engine supports
gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
enable_lora: true # Allow loading LoRA adapters during inference
logprobs_mode: processed_logprobs # Logprobs mode for sampling results
device_group: # Logical device group for the sampler
name: sampler
ranks: 1 # Number of GPUs to use
device_type: cuda
device_mesh:
device_type: cuda
dp_size: 1
queue_config:
rps_limit: 100 # Max requests per second
tps_limit: 100000 # Max tokens per second
deployments:
- name: SamplerManagement
autoscaling_config:
min_replicas: 1
max_replicas: 1
target_ongoing_requests: 16
ray_actor_options:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@

from twinkle.data_format import Message, Trajectory
from twinkle.template import Template
from twinkle_client import init_tinker_client
from twinkle import init_tinker_client

# Step 1: Initialize Tinker client
init_tinker_client()

from tinker import ServiceClient

# Step 2: Define the base model and connect to the server
base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507'
base_model = 'Qwen/Qwen3-4B'
service_client = ServiceClient(
base_url='http://www.modelscope.cn/twinkle',
api_key=os.environ.get('MODELSCOPE_TOKEN')
base_url='http://localhost:8000',
api_key='EMPTY-TOKEN'
)

# Step 3: Create a sampling client by loading weights from a saved checkpoint.
Expand Down
137 changes: 137 additions & 0 deletions cookbook/client/tinker/custom_service/self_cognition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Tinker-Compatible Client - Self-Cognition Training & Evaluation Example
#
# This script demonstrates two workflows using the Tinker-compatible client:
# 1. train(): Fine-tune a model on a self-cognition dataset so it learns
# a custom identity (name, author).
# 2. eval(): Load a trained checkpoint and sample from it to verify
# that the model has learned the custom identity.
# The server must be running first (see server.py and server_config.yaml).
import os
from tqdm import tqdm
from tinker import types
from twinkle import init_tinker_client
from twinkle.data_format import Message, Trajectory
from twinkle.template import Template
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.preprocessor import SelfCognitionProcessor
from twinkle.server.tinker.common import input_feature_to_datum

# Initialize the Tinker client before importing ServiceClient
init_tinker_client()

from tinker import ServiceClient

# The base model to fine-tune / evaluate
base_model = 'Qwen/Qwen3-4B'
base_url = 'http://localhost:8000'
api_key = 'EMPTY_API_KEY'


def train():
    """Fine-tune the base model with a LoRA adapter on the self-cognition dataset.

    Loads 500 examples of ``ms://swift/self-cognition``, rewrites the identity
    placeholders, then runs 3 epochs of forward/backward + Adam steps against
    the Tinker-compatible server, saving a checkpoint after each epoch.
    Uses the module-level ``base_model``, ``base_url`` and ``api_key``.
    """
    # Step 1: Prepare the dataset.

    # Load the self-cognition dataset from ModelScope (first 500 examples).
    dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))

    # Apply the chat template matching the base model (max 256 tokens per sample).
    dataset.set_template('Template', model_id=f'ms://{base_model}', max_length=256)

    # Replace placeholder names with the custom model/author identity.
    dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'), load_from_cache_file=False)

    # Tokenize and encode the dataset into model-ready input features.
    dataset.encode(batched=True, load_from_cache_file=False)

    # Wrap the dataset into a DataLoader that yields batches of size 8.
    dataloader = DataLoader(dataset=dataset, batch_size=8)

    # Step 2: Initialize the training client.
    service_client = ServiceClient(
        base_url=base_url,
        api_key=api_key
    )

    # Create a LoRA training client for the base model (rank=16 for the LoRA adapter).
    training_client = service_client.create_lora_training_client(base_model=base_model, rank=16)

    # Step 3: Run the training loop.
    for epoch in range(3):
        print(f'Epoch {epoch}')
        # The step index was unused, so iterate the batches directly.
        for batch in tqdm(dataloader):
            # Convert each InputFeature into a Datum for the Tinker API.
            input_datum = [input_feature_to_datum(input_feature) for input_feature in batch]

            # Send data to server: forward + backward pass (computes gradients).
            fwdbwd_future = training_client.forward_backward(input_datum, 'cross_entropy')

            # Optimizer step: update model weights with Adam.
            optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4))

            # Wait for both operations to complete before the next batch.
            # fwdbwd_result.loss_fn_outputs carries per-token logprobs if you
            # want to compute a weighted average log-loss for monitoring.
            fwdbwd_result = fwdbwd_future.result()
            optim_result = optim_future.result()

            print(f'Training Metrics: {optim_result}')

        # Save a checkpoint after each epoch.
        save_future = training_client.save_state(f'twinkle-lora-{epoch}')
        save_result = save_future.result()
        print(f'Saved checkpoint to {save_result.path}')


def eval():  # noqa: A001 - shadows builtin eval; name kept for script compatibility
    """Load a trained LoRA checkpoint and sample from it to verify self-cognition.

    Builds a chat prompt asking the model who it is, samples 8 completions,
    and prints the decoded responses. Uses the module-level ``base_model``,
    ``base_url`` and ``api_key``.
    """
    # Step 1: Load the trained LoRA checkpoint for inference.

    # Path to a previously saved LoRA checkpoint (twinkle:// URI).
    # NOTE(review): this example path was produced from a Qwen2.5-7B-Instruct
    # run; replace it with a checkpoint trained on `base_model` (Qwen3-4B),
    # e.g. one saved by train() above.
    weight_path = 'twinkle://20260212_174205-Qwen_Qwen2_5-7B-Instruct-51edc9ed/weights/twinkle-lora-2'

    # Use the same module-level credentials as train() for consistency: the
    # previous env-var token (MODELSCOPE_TOKEN) does not match the local
    # server this script is configured against.
    service_client = ServiceClient(base_url=base_url, api_key=api_key)
    sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model)

    # Step 2: Prepare the chat prompt.

    # Build a multi-turn conversation to test the model's self-cognition.
    template = Template(model_id=f'ms://{base_model}')

    trajectory = Trajectory(
        messages=[
            Message(role='system', content='You are a helpful assistant'),
            Message(role='user', content='你是谁?'),
        ]
    )

    input_feature = template.encode(trajectory, add_generation_prompt=True)

    input_ids = input_feature['input_ids'].tolist()

    # Step 3: Generate responses.

    prompt = types.ModelInput.from_ints(input_ids)
    params = types.SamplingParams(
        max_tokens=50,    # Maximum tokens to generate
        temperature=0.2,  # Low temperature for more focused responses
        stop=['\n']       # Stop at newline
    )

    # Sample 8 independent completions.
    print('Sampling...')
    future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=8)
    result = future.result()

    # Decode and print each response.
    print('Responses:')
    for i, seq in enumerate(result.sequences):
        print(f'{i}: {repr(template.decode(seq.tokens))}')


if __name__ == '__main__':
    train()  # Run training (comment out to skip)
    # eval()  # Uncomment to run evaluation / inference
Loading
Loading