Fix handling of subtitles in long_inference_video function #30

Open · wants to merge 1 commit into base: main
goldfish_lv.py: 83 changes (40 additions & 43 deletions)
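The crux of the change in the diff below is that `batch_subtitles` must stay index-aligned with `batch_video_paths` and `batch_instructions`, with `None` used as a placeholder when subtitles are disabled or unavailable, and with the last partial batch still being processed. A minimal, standalone sketch of that batching idea (the `build_batches` helper is illustrative and not part of goldfish_lv.py):

```python
from typing import Iterator, List, Optional, Tuple

def build_batches(
    clip_paths: List[str],
    subtitle_paths: Optional[List[str]],
    use_subtitles: bool,
    instruction: str,
    batch_size: int = 2,
) -> Iterator[Tuple[List[str], List[str], List[Optional[str]]]]:
    """Yield (videos, instructions, subtitles) lists that stay index-aligned."""
    videos, instructions, subtitles = [], [], []
    for i, clip in enumerate(clip_paths):
        videos.append(clip)
        instructions.append(instruction)
        # Always append something: None serves as a placeholder when no
        # subtitle is available, so the three lists keep the same length
        # and the same ordering.
        if use_subtitles and subtitle_paths:
            subtitles.append(subtitle_paths[i])
        else:
            subtitles.append(None)
        if len(videos) == batch_size:
            yield videos, instructions, subtitles
            videos, instructions, subtitles = [], [], []
    if videos:  # leftover clips that did not fill a whole batch
        yield videos, instructions, subtitles
```

With `batch_size=2`, calling `build_batches(["a.mp4", "b.mp4", "c.mp4"], None, False, "describe")` yields two batches, the second containing the single leftover clip.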
@@ -106,7 +106,7 @@ def __init__(self, args: argparse.Namespace) -> None:
         self.model, self.vis_processor = init_model(args)
         self.original_llama_model,self.original_llama_tokenizer=self.load_original_llama_model()
         # self.summary_instruction="Generate a description of this video .Pay close attention to the objects, actions, emotions portrayed in the video,providing a vivid description of key moments.Specify any visual cues or elements that stand out."
-        self.summary_instruction="I'm a blind person, please provide me with a detailed summary of the video content and try to be as descriptive as possible."
+        self.summary_instruction="I'm a blind person, please provide me with a detailed summary of the video content and try to be as descriptive as possible. The videos are created from Nexar's road camera"
     def load_original_llama_model(self):
         model_name="meta-llama/Llama-2-7b-chat-hf"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -389,7 +389,7 @@ def split_long_video_into_clips(self,video_path):
         os.system(f"python split_long_video_in_parallel.py --video_path {video_path} --output_folder {tmp_save_path}")
         videos_list = sorted(os.listdir(tmp_save_path))
         return videos_list,tmp_save_path
-    def long_inference_video(self, videos_list,tmp_save_path,subtitle_paths, use_subtitles) -> Optional[str]:
+    def long_inference_video(self, videos_list: List[str], tmp_save_path: str, subtitle_paths: Optional[List[str]] = None, use_subtitles: bool = False) -> Optional[dict]:
         save_long_videos_path = "new_workspace/clips_summary/demo"
         os.makedirs(save_long_videos_path, exist_ok=True)
         file_path = f'{save_long_videos_path}/{self.video_name}.json'
@@ -398,46 +398,46 @@ def long_inference_video(self, videos_list,tmp_save_path,subtitle_paths, use_subtitles) -> Optional[str]:
             print("Clips inference already done")
             with open(file_path, 'r') as file:
                 video_information = json.load(file)
-        else:
-            video_number = 0
-            batch_size = 2
-            batch_video_paths, batch_instructions ,batch_subtitles= [], [],[]
-            video_information = {}
-            video_captions = []
-            for i, video in tqdm(enumerate(videos_list), desc="Inference video clips", total=len(videos_list)):
-                clip_path = os.path.join(tmp_save_path, video)
-                batch_video_paths.append(clip_path)
-                # previous_caption = "You are analysing a one long video of mutiple clips and this is the summary from all previous clips :"+video_captions[-1]+"\n\n" if video_captions else ""
-                previous_caption=""
-                batch_instructions.append(self.summary_instruction)
-                # Process each batch
-                if len(batch_video_paths) % batch_size == 0 and i != 0:
-                    batch_preds,videos_conversation=self.run_batch(batch_video_paths,batch_instructions, batch_subtitles,previous_caption)
-                    for pred,subtitle in zip(batch_preds,videos_conversation):
-                        video_number += 1
-                        save_name=f"{video_number}".zfill(5)
-                        video_information[f'caption__{save_name}'] = pred
-                        video_information[f'subtitle__{save_name}'] = subtitle
-                        video_captions.append(pred)
-                    batch_video_paths, batch_instructions,batch_subtitles = [], [],[]
-
-            # Process any remaining videos in the last batch
-            if batch_video_paths:
-                batch_preds,videos_conversation=self.run_batch(batch_video_paths,batch_instructions, batch_subtitles,previous_caption)
-                for pred,subtitle in zip(batch_preds,videos_conversation):
-                    video_number += 1
-                    save_name=f"{video_number}".zfill(5)
-                    video_information[f'caption__{save_name}'] = pred
-                    video_information[f'subtitle__{save_name}'] = subtitle
-                    video_captions.append(pred)
-            # summary = self.compine_summaries(preds)
-            # preds['summary'] = summary
-            video_information['summary'] ="summary"
-            with open(file_path, 'w') as file:
-                json.dump(video_information, file, indent=4)
-            print("Clips inference done")
+            return video_information
+
+        video_information = {}
+        video_captions = []
+        batch_video_paths, batch_instructions, batch_subtitles = [], [], []
+
+        for i, video in tqdm(enumerate(videos_list), desc="Inference video clips", total=len(videos_list)):
+            clip_path = os.path.join(tmp_save_path, video)
+            batch_video_paths.append(clip_path)
+            batch_instructions.append(self.summary_instruction)
+            if use_subtitles and subtitle_paths:
+                batch_subtitles.append(subtitle_paths[i])
+            else:
+                batch_subtitles.append(None) # Handling for no subtitle case
+
+            if len(batch_video_paths) % 2 == 0: # Assuming batch_size is 2
+                batch_preds, videos_conversation = self.run_batch(batch_video_paths, batch_instructions, batch_subtitles)
+                for pred, subtitle in zip(batch_preds, videos_conversation):
+                    video_number = i + 1 # Better scope handling for video number
+                    save_name = f"{video_number}".zfill(5)
+                    video_information[f'caption__{save_name}'] = pred
+                    video_information[f'subtitle__{save_name}'] = subtitle
+                    video_captions.append(pred)
+                batch_video_paths, batch_instructions, batch_subtitles = [], [], []
+
+        # Process any remaining videos in the last batch
+        if batch_video_paths:
+            batch_preds, videos_conversation = self.run_batch(batch_video_paths, batch_instructions, batch_subtitles)
+            for pred, subtitle in zip(batch_preds, videos_conversation):
+                video_number = len(videos_list) - len(batch_video_paths) + len(batch_preds)
+                save_name = f"{video_number}".zfill(5)
+                video_information[f'caption__{save_name}'] = pred
+                video_information[f'subtitle__{save_name}'] = subtitle
+                video_captions.append(pred)
+
+        video_information['summary'] = "summary" # Example summary
+        with open(file_path, 'w') as file:
+            json.dump(video_information, file, indent=4)
+        print("Clips inference done")
+
         return video_information
 
     def compine_summaries(self, text: str, rag: str = False) -> str:
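The rewritten loop persists its results as a flat JSON file at `new_workspace/clips_summary/demo/<video_name>.json`, keyed by a zero-padded clip index. A sketch of what that file could contain after two clips (all caption and subtitle strings are invented for illustration):

```python
import json

# Illustrative only: the shape of the per-video results file written by
# long_inference_video; the caption/subtitle values are made up.
example = {
    "caption__00001": "A car waits at a red light while pedestrians cross.",
    "subtitle__00001": "no subtitles available",
    "caption__00002": "The camera vehicle merges onto a highway in light rain.",
    "subtitle__00002": "no subtitles available",
    "summary": "summary",  # placeholder value, as in the diff above
}
print(json.dumps(example, indent=4))
```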
@@ -832,7 +832,4 @@ def run_images (self,prepared_images,prepared_instruction,return_embedding=False):
 # if len(contexts)>0:
 #     print(minigpt_lv.inference_RAG(questions,contexts))
 
-# print("time for 18 clip",time.time()-t1)
-
-
-
+# print("time for 18 clip",time.time()-t1)