diff --git a/goldfish_lv.py b/goldfish_lv.py
index b7e717c..c956698 100644
--- a/goldfish_lv.py
+++ b/goldfish_lv.py
@@ -106,7 +106,7 @@ def __init__(self, args: argparse.Namespace) -> None:
         self.model, self.vis_processor = init_model(args)
         self.original_llama_model,self.original_llama_tokenizer=self.load_original_llama_model()
         # self.summary_instruction="Generate a description of this video .Pay close attention to the objects, actions, emotions portrayed in the video,providing a vivid description of key moments.Specify any visual cues or elements that stand out."
-        self.summary_instruction="I'm a blind person, please provide me with a detailed summary of the video content and try to be as descriptive as possible."
+        self.summary_instruction="I'm a blind person, please provide me with a detailed summary of the video content and try to be as descriptive as possible. The videos are created from Nexar's road camera"
     def load_original_llama_model(self):
         model_name="meta-llama/Llama-2-7b-chat-hf"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -389,7 +389,7 @@ def split_long_video_into_clips(self,video_path):
         os.system(f"python split_long_video_in_parallel.py --video_path {video_path} --output_folder {tmp_save_path}")
         videos_list = sorted(os.listdir(tmp_save_path))
         return videos_list,tmp_save_path
-    def long_inference_video(self, videos_list,tmp_save_path,subtitle_paths, use_subtitles) -> Optional[str]:
+    def long_inference_video(self, videos_list: List[str], tmp_save_path: str, subtitle_paths: Optional[List[str]] = None, use_subtitles: bool = False) -> Optional[dict]:
         save_long_videos_path = "new_workspace/clips_summary/demo"
         os.makedirs(save_long_videos_path, exist_ok=True)
         file_path = f'{save_long_videos_path}/{self.video_name}.json'
@@ -398,46 +398,46 @@ def long_inference_video(self, videos_list,tmp_save_path,subtitle_paths, use_sub
             print("Clips inference already done")
             with open(file_path, 'r') as file:
                 video_information = json.load(file)
-        else:
-            video_number = 0
-            batch_size = 2
-            batch_video_paths, batch_instructions ,batch_subtitles= [], [],[]
-            video_information = {}
-            video_captions = []
-            for i, video in tqdm(enumerate(videos_list), desc="Inference video clips", total=len(videos_list)):
-                clip_path = os.path.join(tmp_save_path, video)
-                batch_video_paths.append(clip_path)
-                # previous_caption = "You are analysing a one long video of mutiple clips and this is the summary from all previous clips :"+video_captions[-1]+"\n\n" if video_captions else ""
-                previous_caption=""
-                batch_instructions.append(self.summary_instruction)
+            return video_information
+
+        video_information = {}
+        video_captions = []
+        batch_video_paths, batch_instructions, batch_subtitles = [], [], []
+
+        for i, video in tqdm(enumerate(videos_list), desc="Inference video clips", total=len(videos_list)):
+            clip_path = os.path.join(tmp_save_path, video)
+            batch_video_paths.append(clip_path)
+            batch_instructions.append(self.summary_instruction)
+            if use_subtitles and subtitle_paths:
                 batch_subtitles.append(subtitle_paths[i])
-                # Process each batch
-                if len(batch_video_paths) % batch_size == 0 and i != 0:
-                    batch_preds,videos_conversation=self.run_batch(batch_video_paths,batch_instructions, batch_subtitles,previous_caption)
-                    for pred,subtitle in zip(batch_preds,videos_conversation):
-                        video_number += 1
-                        save_name=f"{video_number}".zfill(5)
-                        video_information[f'caption__{save_name}'] = pred
-                        video_information[f'subtitle__{save_name}'] = subtitle
-                        video_captions.append(pred)
-                    batch_video_paths, batch_instructions,batch_subtitles = [], [],[]
-
-            # Process any remaining videos in the last batch
-            if batch_video_paths:
-                batch_preds,videos_conversation=self.run_batch(batch_video_paths,batch_instructions, batch_subtitles,previous_caption)
-                for pred,subtitle in zip(batch_preds,videos_conversation):
-                    video_number += 1
-                    save_name=f"{video_number}".zfill(5)
+            else:
+                batch_subtitles.append(None)  # Handling for no subtitle case
+
+            if len(batch_video_paths) % 2 == 0:  # Assuming batch_size is 2
+                batch_preds, videos_conversation = self.run_batch(batch_video_paths, batch_instructions, batch_subtitles)
+                for pred, subtitle in zip(batch_preds, videos_conversation):
+                    video_number = i + 1  # Better scope handling for video number
+                    save_name = f"{video_number}".zfill(5)
                     video_information[f'caption__{save_name}'] = pred
                     video_information[f'subtitle__{save_name}'] = subtitle
-                    video_captions.append(pred)
-
-            # summary = self.compine_summaries(preds)
-            # preds['summary'] = summary
-            video_information['summary'] ="summary"
-            with open(file_path, 'w') as file:
-                json.dump(video_information, file, indent=4)
-            print("Clips inference done")
+                    video_captions.append(pred)
+                batch_video_paths, batch_instructions, batch_subtitles = [], [], []
+
+        # Process any remaining videos in the last batch
+        if batch_video_paths:
+            batch_preds, videos_conversation = self.run_batch(batch_video_paths, batch_instructions, batch_subtitles)
+            for pred, subtitle in zip(batch_preds, videos_conversation):
+                video_number = len(videos_list) - len(batch_video_paths) + len(batch_preds)
+                save_name = f"{video_number}".zfill(5)
+                video_information[f'caption__{save_name}'] = pred
+                video_information[f'subtitle__{save_name}'] = subtitle
+                video_captions.append(pred)
+
+        video_information['summary'] = "summary"  # Example summary
+        with open(file_path, 'w') as file:
+            json.dump(video_information, file, indent=4)
+        print("Clips inference done")
+
+        return video_information
 
 
     def compine_summaries(self, text: str, rag: str = False) -> str:
@@ -832,7 +832,4 @@ def run_images (self,prepared_images,prepared_instruction,return_embedding=False
 
 # if len(contexts)>0:
 #     print(minigpt_lv.inference_RAG(questions,contexts))
-# print("time for 18 clip",time.time()-t1)
-
-
-
+# print("time for 18 clip",time.time()-t1)
\ No newline at end of file
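For reference, a minimal usage sketch of the updated long_inference_video signature (not part of the patch). It assumes an already-initialized instance of the patched class, named minigpt_lv here only because that name appears in the commented driver code above, and a placeholder video path:

    # Minimal sketch, assuming `minigpt_lv` is an initialized instance of the patched class
    # and "long_video.mp4" is a hypothetical placeholder path.
    videos_list, tmp_save_path = minigpt_lv.split_long_video_into_clips("long_video.mp4")

    # subtitle_paths now defaults to None and use_subtitles to False,
    # so the no-subtitle case needs no placeholder arguments.
    video_information = minigpt_lv.long_inference_video(videos_list, tmp_save_path)

    # The returned dict mirrors what is written to new_workspace/clips_summary/demo/<video_name>.json:
    # caption__00001, subtitle__00001, ..., plus a 'summary' entry.
    print(video_information["summary"])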