diff --git a/lightx2v/shot_runner/rs2v_infer.py b/lightx2v/shot_runner/rs2v_infer.py index 11d1ccd6..1b99f40b 100755 --- a/lightx2v/shot_runner/rs2v_infer.py +++ b/lightx2v/shot_runner/rs2v_infer.py @@ -155,8 +155,8 @@ def load_audio(audio_path, target_sr): clip_input_info.overlap_latent = gen_latents[:, -1:] if clip_input_info.return_result_tensor: - gen_video_list.append(video_seg.clone()) - cut_audio_list.append(audio_seg) + gen_video_list.append(video_seg.clone().cpu().float()) + cut_audio_list.append(audio_seg.cpu()) elif self.va_controller.recorder is not None: video_seg = torch.clamp(video_seg, -1, 1).to(torch.float).cpu() video_seg = vae_to_comfyui_image_inplace(video_seg)