From dea50df94eb0b4ad5110af821067b4e2e3dcf4e4 Mon Sep 17 00:00:00 2001 From: BoringCrypto Date: Sat, 6 May 2023 21:15:00 +0700 Subject: [PATCH 1/3] Renamed conda envs to match filenames --- environment-cpu.yml | 2 +- environment-cuda.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment-cpu.yml b/environment-cpu.yml index 2e896e2d..4b93fed6 100644 --- a/environment-cpu.yml +++ b/environment-cpu.yml @@ -1,4 +1,4 @@ -name: bark-infinity +name: bark-infinity-cpu channels: - defaults - pytorch diff --git a/environment-cuda.yml b/environment-cuda.yml index f6af6ce4..59c6c85f 100644 --- a/environment-cuda.yml +++ b/environment-cuda.yml @@ -1,4 +1,4 @@ -name: bark-infinity-oneclick +name: bark-infinity-cuda channels: - defaults - pytorch From 16a85a58affdbb38efa2bb86179cfb070a89b58f Mon Sep 17 00:00:00 2001 From: BoringCrypto Date: Sat, 6 May 2023 22:04:04 +0700 Subject: [PATCH 2/3] Added requirements --- environment-cpu.yml | 4 +++- environment-cuda.yml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/environment-cpu.yml b/environment-cpu.yml index 4b93fed6..f8195225 100644 --- a/environment-cpu.yml +++ b/environment-cpu.yml @@ -35,4 +35,6 @@ dependencies: - pathvalidate - rich - gradio - - nltk \ No newline at end of file + - nltk + - encodec + - rich_argparse \ No newline at end of file diff --git a/environment-cuda.yml b/environment-cuda.yml index 59c6c85f..59631481 100644 --- a/environment-cuda.yml +++ b/environment-cuda.yml @@ -38,4 +38,6 @@ dependencies: - pathvalidate - rich - gradio - - nltk \ No newline at end of file + - nltk + - encodec + - rich_argparse \ No newline at end of file From 0011f8010975c9d551e63caa5232aa8411efcdbb Mon Sep 17 00:00:00 2001 From: BoringCrypto Date: Sat, 6 May 2023 22:04:40 +0700 Subject: [PATCH 3/3] pep8 auto formatting applied --- bark_infinity/__init__.py | 1 - bark_infinity/api.py | 307 ++++++++++--------- bark_infinity/bark_legacy/bark_perform.py | 144 +++++---- bark_infinity/bark_legacy/bark_speak.py | 56 ++-- bark_infinity/clonevoice.py | 24 +- bark_infinity/config.py | 250 ++++++++++------ bark_infinity/generation.py | 156 ++++++---- bark_infinity/model.py | 90 ++++-- bark_infinity/model_fine.py | 24 +- bark_perform.py | 21 +- bark_webui.py | 341 ++++++++++++---------- 11 files changed, 810 insertions(+), 604 deletions(-) diff --git a/bark_infinity/__init__.py b/bark_infinity/__init__.py index 6f3b4ba9..2a6ae99a 100644 --- a/bark_infinity/__init__.py +++ b/bark_infinity/__init__.py @@ -4,4 +4,3 @@ from .api import generate_audio_long, render_npz_samples, list_speakers from .config import logger, console, get_default_values, load_all_defaults, VALID_HISTORY_PROMPT_DIRS - diff --git a/bark_infinity/api.py b/bark_infinity/api.py index 2310c3e0..0bd8789d 100644 --- a/bark_infinity/api.py +++ b/bark_infinity/api.py @@ -6,7 +6,7 @@ from scipy.io.wavfile import write as write_wav -## ADDED +# ADDED import os import re import datetime @@ -25,6 +25,7 @@ from bark_infinity import text_processing + def text_to_semantic( text: str, history_prompt: Optional[Union[Dict, str]] = None, @@ -95,11 +96,11 @@ def semantic_to_waveform( def save_as_prompt(filepath, full_generation): - assert(filepath.endswith(".npz")) - assert(isinstance(full_generation, dict)) - assert("semantic_prompt" in full_generation) - assert("coarse_prompt" in full_generation) - assert("fine_prompt" in full_generation) + assert (filepath.endswith(".npz")) + assert (isinstance(full_generation, dict)) + assert ("semantic_prompt" in 
full_generation) + assert ("coarse_prompt" in full_generation) + assert ("fine_prompt" in full_generation) np.savez(filepath, **full_generation) @@ -144,12 +145,13 @@ def generate_audio( audio_arr = out return audio_arr -## ADDED BELOW +# ADDED BELOW + def process_history_prompt(**kwargs): user_history_prompt = kwargs.get('history_prompt', None) valid_directories_to_check = VALID_HISTORY_PROMPT_DIRS - + if user_history_prompt is None: return None @@ -166,27 +168,28 @@ def process_history_prompt(**kwargs): logger.error(f" >> Can't find speaker file at: {full_path}") else: for directory in valid_directories_to_check: - full_path_in_dir = os.path.join(directory, f"{file_name}{file_extension}") + full_path_in_dir = os.path.join( + directory, f"{file_name}{file_extension}") if os.path.exists(full_path_in_dir): return full_path_in_dir - logger.error(f" >>! Can't find speaker file: {full_path} in: {valid_directories_to_check}") + logger.error( + f" >>! Can't find speaker file: {full_path} in: {valid_directories_to_check}") return None -def log_params(log_filepath, **kwargs): +def log_params(log_filepath, **kwargs): from rich.console import Console file_console = Console(color_system=None) with file_console.capture() as capture: - kwargs['history_prompt'] = kwargs.get('history_prompt_string',None) + kwargs['history_prompt'] = kwargs.get('history_prompt_string', None) kwargs['history_prompt_string'] = None file_console.print(kwargs) str_output = capture.get() - log_filepath = generate_unique_filepath(log_filepath) with open(log_filepath, "wt") as log_file: log_file.write(str_output) @@ -194,18 +197,18 @@ def log_params(log_filepath, **kwargs): return -def determine_output_filename(special_one_off_path = None, **kwargs): - if special_one_off_path: +def determine_output_filename(special_one_off_path=None, **kwargs): + if special_one_off_path: return sanitize_filepath(special_one_off_path) # normally generate a filename - output_dir = kwargs.get('output_dir',None) - output_filename = kwargs.get('output_filename',None) - + output_dir = kwargs.get('output_dir', None) + output_filename = kwargs.get('output_filename', None) # TODO: Offer a config for long clips to show only the original starting prompt. I prefer seeing each clip seperately names for easy referencing myself. 
- text_prompt = kwargs.get('text_prompt',None) or kwargs.get('text',None) or '' - history_prompt = kwargs.get('history_prompt_string',None) or 'random' + text_prompt = kwargs.get( + 'text_prompt', None) or kwargs.get('text', None) or '' + history_prompt = kwargs.get('history_prompt_string', None) or 'random' text_prompt = text_prompt.strip() history_prompt = os.path.basename(history_prompt).replace('.npz', '') @@ -219,7 +222,7 @@ def determine_output_filename(special_one_off_path = None, **kwargs): hoarder_mode = kwargs.get('hoarder_mode', False) if hoarder_mode: segment_number = kwargs.get("segment_number") - if segment_number and kwargs.get("total_segments", 1) > 1: + if segment_number and kwargs.get("total_segments", 1) > 1: segment_number_text = f"{str(segment_number).zfill(3)}_" if output_filename: @@ -237,7 +240,7 @@ def determine_output_filename(special_one_off_path = None, **kwargs): token_probs_history_entropy_std = entropy_std(token_probs_history) extra_stats = f"ent-{token_probs_history_entropy:.2f}_perp-{token_probs_history_perplexity:.2f}_entstd-{token_probs_history_entropy_std:.2f}" """ - date_str = datetime.datetime.now().strftime("%y-%m%d-%H%M-%S") + date_str = datetime.datetime.now().strftime("%y-%m%d-%H%M-%S") truncated_text = text_prompt[:20].strip() base_output_filename = f"{truncated_text}-SPK-{history_prompt}" @@ -245,7 +248,6 @@ def determine_output_filename(special_one_off_path = None, **kwargs): if segment_number_text is not None: base_output_filename = f"{segment_number_text}{base_output_filename}" - base_output_filename = f"{base_output_filename}.wav" output_filepath = ( @@ -256,9 +258,9 @@ def determine_output_filename(special_one_off_path = None, **kwargs): return output_filepath -def write_one_segment(audio_arr = None, full_generation = None, **kwargs): +def write_one_segment(audio_arr=None, full_generation=None, **kwargs): filepath = determine_output_filename(**kwargs) - #print(f"Looks like filepath is {filepath} is okay?") + # print(f"Looks like filepath is {filepath} is okay?") if full_generation is not None: write_seg_npz(filepath, full_generation, **kwargs) if audio_arr is not None and kwargs.get("segment_number", 1) != "base_history": @@ -267,10 +269,7 @@ def write_one_segment(audio_arr = None, full_generation = None, **kwargs): hoarder_mode = kwargs.get('hoarder_mode', False) dry_run = kwargs.get('dry_run', False) if hoarder_mode and not dry_run: - log_params(f"{filepath}_info.txt",**kwargs) - - - + log_params(f"{filepath}_info.txt", **kwargs) def generate_unique_dirpath(dirpath): @@ -283,6 +282,7 @@ def generate_unique_dirpath(dirpath): counter += 1 return unique_dirpath + def generate_unique_filepath(filepath): unique_filename = sanitize_filepath(filepath) name, ext = os.path.splitext(filepath) @@ -292,9 +292,10 @@ def generate_unique_filepath(filepath): counter += 1 return unique_filename + def write_seg_npz(filepath, full_generation, **kwargs): - #logger.debug(kwargs) + # logger.debug(kwargs) if kwargs.get("segment_number", 1) == "base_history": filepath = f"{filepath}_initial_prompt.npz" @@ -302,34 +303,34 @@ def write_seg_npz(filepath, full_generation, **kwargs): if not kwargs.get('dry_run', False) and kwargs.get('always_save_speaker', True): filepath = generate_unique_filepath(filepath) - np.savez_compressed(filepath, semantic_prompt = full_generation["semantic_prompt"], coarse_prompt = full_generation["coarse_prompt"], fine_prompt = full_generation["fine_prompt"]) - - + np.savez_compressed(filepath, 
semantic_prompt=full_generation["semantic_prompt"], + coarse_prompt=full_generation["coarse_prompt"], fine_prompt=full_generation["fine_prompt"]) + logger.info(f" .npz saved to {filepath} {dry_text}") + def write_seg_wav(filepath, audio_arr, **kwargs): dry_run = kwargs.get('dry_run', False) dry_text = '(dry run)' if dry_run else '' - if dry_run is not True: + if dry_run is not True: filepath = generate_unique_filepath(filepath) write_audiofile(filepath, audio_arr) logger.info(f" .wav saved to {filepath} {dry_text}") - def write_audiofile(output_filepath, audio_arr): output_filepath = generate_unique_filepath(output_filepath) write_wav(output_filepath, SAMPLE_RATE, audio_arr) - #sample_rate = 24000 - #soundfile.write(output_filepath, audio_arr, sample_rate,format='WAV', subtype='PCM_16') + # sample_rate = 24000 + # soundfile.write(output_filepath, audio_arr, sample_rate,format='WAV', subtype='PCM_16') # print(f"[green] ") - def call_with_non_none_params(func, **kwargs): - non_none_params = {key: value for key, value in kwargs.items() if value is not None} + non_none_params = {key: value for key, + value in kwargs.items() if value is not None} return func(**non_none_params) @@ -361,23 +362,21 @@ def generate_audio_barki( silent = kwargs.get("silent", None) output_full = kwargs.get("output_full", None) - - seed = kwargs.get("seed",None) + seed = kwargs.get("seed", None) if seed is not None: set_seed(seed) - ## TODO seperate stage seeds + # TODO seperate stage seeds - ## Semantic Options + # Semantic Options semantic_temp = text_temp if kwargs.get("semantic_temp", None): semantic_temp = kwargs.get("semantic_temp") - semantic_seed = kwargs.get("semantic_seed",None) + semantic_seed = kwargs.get("semantic_seed", None) if semantic_seed is not None: set_seed(semantic_seed) - semantic_tokens = call_with_non_none_params( generate_text_semantic, text=text, @@ -386,22 +385,21 @@ def generate_audio_barki( top_k=kwargs.get("semantic_top_k", None), top_p=kwargs.get("semantic_top_p", None), silent=silent, - min_eos_p = kwargs.get("semantic_min_eos_p", None), - max_gen_duration_s = kwargs.get("semantic_max_gen_duration_s", None), - allow_early_stop = kwargs.get("semantic_allow_early_stop", True), + min_eos_p=kwargs.get("semantic_min_eos_p", None), + max_gen_duration_s=kwargs.get("semantic_max_gen_duration_s", None), + allow_early_stop=kwargs.get("semantic_allow_early_stop", True), use_kv_caching=kwargs.get("semantic_use_kv_caching", True), ) - - ## Coarse Options + # Coarse Options coarse_temp = waveform_temp if kwargs.get("coarse_temp", None): coarse_temp = kwargs.get("coarse_temp") - coarse_seed = kwargs.get("coarse_seed",None) + coarse_seed = kwargs.get("coarse_seed", None) if coarse_seed is not None: set_seed(coarse_seed) - + coarse_tokens = call_with_non_none_params( generate_coarse, x_semantic=semantic_tokens, @@ -417,7 +415,7 @@ def generate_audio_barki( fine_temp = kwargs.get("fine_temp", 0.5) - fine_seed = kwargs.get("fine_seed",None) + fine_seed = kwargs.get("fine_seed", None) if fine_seed is not None: set_seed(fine_seed) @@ -446,50 +444,45 @@ def generate_audio_barki( if output_full: return full_generation, audio_arr - - return audio_arr - + return audio_arr def generate_audio_long_from_gradio(**kwargs): - output_iterations = kwargs.get("output_iterations", 1) - text_prompt = kwargs.get("text_prompt") - full_generation_segments, audio_arr_segments, final_filename_will_be = [], [], None - for iteration in range(1,output_iterations + 1): - if output_iterations > 1: - print(f"\nIteration {iteration} 
of {output_iterations}.") - if iteration == 1: - print(" ", text_prompt) + output_iterations = kwargs.get("output_iterations", 1) + text_prompt = kwargs.get("text_prompt") + full_generation_segments, audio_arr_segments, final_filename_will_be = [], [], None + for iteration in range(1, output_iterations + 1): + if output_iterations > 1: + print(f"\nIteration {iteration} of {output_iterations}.") + if iteration == 1: + print(" ", text_prompt) - kwargs["current_iteration"] = iteration - full_generation_segments, audio_arr_segments, final_filename_will_be = generate_audio_long(**kwargs) + kwargs["current_iteration"] = iteration + full_generation_segments, audio_arr_segments, final_filename_will_be = generate_audio_long( + **kwargs) - return full_generation_segments, audio_arr_segments, final_filename_will_be + return full_generation_segments, audio_arr_segments, final_filename_will_be def generate_audio_long( **kwargs, ): - kwargs = load_all_defaults(**kwargs) logger.debug(locals()) - history_prompt = kwargs.get("history_prompt", None) silent = kwargs.get("silent", None) - + full_generation_segments = [] audio_arr_segments = [] - - stable_mode_interval = kwargs.get('stable_mode_interval', None) - if stable_mode_interval < 0: + if stable_mode_interval < 0: stable_mode_interval = 0 stable_mode_interval_counter = None @@ -500,17 +493,19 @@ def generate_audio_long( dry_run = kwargs.get('dry_run', None) # yanked for now, required too many mods to core Bark code - extra_confused_travolta_mode = kwargs.get('extra_confused_travolta_mode', None) + extra_confused_travolta_mode = kwargs.get( + 'extra_confused_travolta_mode', None) hoarder_mode = kwargs.get('hoarder_mode', None) - single_starting_seed = kwargs.get("single_starting_seed",None) + single_starting_seed = kwargs.get("single_starting_seed", None) if single_starting_seed is not None: set_seed(single_starting_seed) # the old way of doing this - split_each_text_prompt_by = kwargs.get("split_each_text_prompt_by",None) - split_each_text_prompt_by_value = kwargs.get("split_each_text_prompt_by_value",None) + split_each_text_prompt_by = kwargs.get("split_each_text_prompt_by", None) + split_each_text_prompt_by_value = kwargs.get( + "split_each_text_prompt_by_value", None) if split_each_text_prompt_by is not None and split_each_text_prompt_by_value is not None: audio_segments = chunk_up_text_prev(**kwargs) @@ -524,11 +519,14 @@ def generate_audio_long( history_prompt = process_history_prompt(**kwargs) if history_prompt is not None: base_history = np.load(history_prompt) - base_history = {key: base_history[key] for key in base_history.keys()} + base_history = {key: base_history[key] + for key in base_history.keys()} kwargs['history_prompt_string'] = history_prompt_string - history_prompt_for_next_segment = base_history # just start from a dict for consistency - else: - logger.error(f"Speaker {history_prompt} could not be found, looking in{VALID_HISTORY_PROMPT_DIRS}") + # just start from a dict for consistency + history_prompt_for_next_segment = base_history + else: + logger.error( + f"Speaker {history_prompt} could not be found, looking in{VALID_HISTORY_PROMPT_DIRS}") return None, None, None # way too many files, for hoarder_mode every sample is in own dir @@ -539,19 +537,16 @@ def generate_audio_long( output_dir = os.path.join(output_dir, output_dir_sub) kwargs['output_dir'] = output_dir - if hoarder_mode and kwargs.get("history_prompt_string", False): kwargs['segment_number'] = "base_history" - write_one_segment(audio_arr = None, full_generation = 
base_history, **kwargs) + write_one_segment( + audio_arr=None, full_generation=base_history, **kwargs) full_generation, audio_arr = (None, None) kwargs["output_full"] = True kwargs["total_segments"] = len(audio_segments) - - - for i, segment_text in enumerate(audio_segments): estimated_time = estimate_spoken_time(segment_text) kwargs["text_prompt"] = segment_text @@ -560,11 +555,10 @@ def generate_audio_long( timeest = f"[bold red]{estimated_time:.2f}[/bold red]" segment_number = i + 1 - console.print(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s") - #tqdm.write(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s") - #tqdm.set_postfix_str(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s") - - + console.print( + f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s") + # tqdm.write(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s") + # tqdm.set_postfix_str(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s") if not silent: print(f"{segment_text}") @@ -574,23 +568,22 @@ def generate_audio_long( full_generation, audio_arr = [], [] else: kwargs['history_prompt'] = history_prompt_for_next_segment - full_generation, audio_arr = generate_audio_barki(text=segment_text, **kwargs) - + full_generation, audio_arr = generate_audio_barki( + text=segment_text, **kwargs) + # if we weren't given a history prompt, save first segment instead if base_history is None: base_history = full_generation - logger.debug(f"stable_mode_interval: {stable_mode_interval_counter} of {stable_mode_interval}") - - - #history_prompt_for_next_segment["semantic_prompt"] = np.hstack([base_history#["semantic_prompt"], full_generation["semantic_prompt"]]).astype(np.int64) + logger.debug( + f"stable_mode_interval: {stable_mode_interval_counter} of {stable_mode_interval}") + # history_prompt_for_next_segment["semantic_prompt"] = np.hstack([base_history#["semantic_prompt"], full_generation["semantic_prompt"]]).astype(np.int64) if stable_mode_interval == 0: history_prompt_for_next_segment = full_generation - - + elif stable_mode_interval == 1: history_prompt_for_next_segment = base_history @@ -599,43 +592,42 @@ def generate_audio_long( # reset to base history stable_mode_interval_counter = stable_mode_interval history_prompt_for_next_segment = base_history - logger.info(f"resetting to base history_prompt, again in {stable_mode_interval} chunks") + logger.info( + f"resetting to base history_prompt, again in {stable_mode_interval} chunks") else: stable_mode_interval_counter -= 1 - history_prompt_for_next_segment = full_generation + history_prompt_for_next_segment = full_generation else: - logger.error(f"stable_mode_interval is {stable_mode_interval} and something has gone wrong.") + logger.error( + f"stable_mode_interval is {stable_mode_interval} and something has gone wrong.") return None, None - # TODO coarse ratio adjust fix - - full_generation_segments.append(full_generation) audio_arr_segments.append(audio_arr) - add_silence_between_segments = kwargs.get("add_silence_between_segments", 0.0) + add_silence_between_segments = kwargs.get( + "add_silence_between_segments", 0.0) if add_silence_between_segments > 0.0: - silence = np.zeros(int(add_silence_between_segments * SAMPLE_RATE)) + silence = np.zeros( + int(add_silence_between_segments * SAMPLE_RATE)) audio_arr_segments.append(silence.copy()) - - - - kwargs['segment_number'] = "final" final_filename_will_be = determine_output_filename(**kwargs) dry_run = kwargs.get('dry_run', 
None) - if not dry_run: write_one_segment(audio_arr = np.concatenate(audio_arr_segments), full_generation = full_generation_segments[0], **kwargs) + if not dry_run: + write_one_segment(audio_arr=np.concatenate( + audio_arr_segments), full_generation=full_generation_segments[0], **kwargs) print(f"Saved to {final_filename_will_be}") - + # TODO remember you used as a filename return full_generation_segments, audio_arr_segments, final_filename_will_be -def play_superpack_track(superpack_filepath = None, one_random=True): +def play_superpack_track(superpack_filepath=None, one_random=True): try: npz_file = np.load(superpack_filepath) @@ -658,7 +650,6 @@ def render_npz_samples(npz_directory="bark_infinity/assets/prompts/", start_from print(f"Rendering samples for speakers in: {npz_directory}") npz_files = [f for f in os.listdir(npz_directory) if f.endswith(".npz")] - if start_from is None: start_from = "fine_prompt" @@ -674,10 +665,11 @@ def render_npz_samples(npz_directory="bark_infinity/assets/prompts/", start_from fine_tokens = history_prompt["fine_prompt"] if gen_minor_variants is None: - + if start_from == "pure_semantic": # this required my mod generate_text_semantic, need to pretend it's two prompts - semantic_tokens = generate_text_semantic(text=None, history_prompt = history_prompt) + semantic_tokens = generate_text_semantic( + text=None, history_prompt=history_prompt) coarse_tokens = generate_coarse(semantic_tokens) fine_tokens = generate_fine(coarse_tokens) @@ -687,71 +679,80 @@ def render_npz_samples(npz_directory="bark_infinity/assets/prompts/", start_from elif start_from == "coarse_prompt": fine_tokens = generate_fine(coarse_tokens) - + elif start_from == "fine_prompt": # just decode existing fine tokens pass - history_prompt_render_variant = {"semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens} - + history_prompt_render_variant = { + "semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens} - elif gen_minor_variants > 0: # gen_minor_variants quick and simple - print(f"Generating {gen_minor_variants} minor variants for {npz_file}") + elif gen_minor_variants > 0: # gen_minor_variants quick and simple + print( + f"Generating {gen_minor_variants} minor variants for {npz_file}") gen_minor_variants = gen_minor_variants or 1 for i in range(gen_minor_variants): temp_coarse = random.uniform(0.5, 0.9) top_k_coarse = None if random.random() < 1/3 else random.randint(50, 100) top_p_coarse = None if random.random() < 1/3 else random.uniform(0.8, 0.95) - max_coarse_history_options = [630, random.randint(500, 630), random.randint(60, 500)] + max_coarse_history_options = [ + 630, random.randint(500, 630), random.randint(60, 500)] max_coarse_history = random.choice(max_coarse_history_options) - coarse_tokens = generate_coarse(semantic_tokens, temp=temp_coarse, top_k=top_k_coarse, top_p=top_p_coarse, max_coarse_history=max_coarse_history) + coarse_tokens = generate_coarse( + semantic_tokens, temp=temp_coarse, top_k=top_k_coarse, top_p=top_p_coarse, max_coarse_history=max_coarse_history) temp_fine = random.uniform(0.3, 0.7) fine_tokens = generate_fine(coarse_tokens, temp=temp_fine) - history_prompt_render_variant = {"semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens} + history_prompt_render_variant = { + "semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens} try: audio_arr = codec_decode(fine_tokens) - base_output_filename = 
os.path.splitext(npz_file)[0] + f"_var_{i}.wav" - output_filepath = os.path.join(npz_directory, base_output_filename) + base_output_filename = os.path.splitext( + npz_file)[0] + f"_var_{i}.wav" + output_filepath = os.path.join( + npz_directory, base_output_filename) output_filepath = generate_unique_filepath(output_filepath) - print(f" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}") + print( + f" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}") write_seg_wav(output_filepath, audio_arr) - write_seg_npz(output_filepath, history_prompt_render_variant) + write_seg_npz(output_filepath, + history_prompt_render_variant) except: print(f" ") - if not compression_mode: try: audio_arr = codec_decode(fine_tokens) base_output_filename = os.path.splitext(npz_file)[0] + ".wav" - output_filepath = os.path.join(npz_directory, base_output_filename) + output_filepath = os.path.join( + npz_directory, base_output_filename) output_filepath = generate_unique_filepath(output_filepath) - print(f" Rendering audio for {npz_filepath} to {output_filepath}") + print( + f" Rendering audio for {npz_filepath} to {output_filepath}") write_seg_wav(output_filepath, audio_arr) if save_npz: - write_seg_npz(output_filepath, history_prompt_render_variant) + write_seg_npz(output_filepath, + history_prompt_render_variant) except: print(f" ") elif compression_mode: - just_record_it = {"semantic_prompt": None, "coarse_prompt": coarse_tokens, "fine_prompt": None} + just_record_it = {"semantic_prompt": None, + "coarse_prompt": coarse_tokens, "fine_prompt": None} compress_mode_data.append(just_record_it) - #compress_mode_data.append(history_prompt_render_variant) + # compress_mode_data.append(history_prompt_render_variant) if compression_mode: print(f"have {len(compress_mode_data)} samples") output_filepath = os.path.join(npz_directory, "superpack.npz") output_filepath = generate_unique_filepath(output_filepath) with open(f"{output_filepath}", 'wb') as f: - np.savez_compressed(f, **{f"dict_{i}": np.array([d]) for i, d in enumerate(compress_mode_data)}) - - - + np.savez_compressed( + f, **{f"dict_{i}": np.array([d]) for i, d in enumerate(compress_mode_data)}) def resize_semantic_history(semantic_history, weight, max_len=256): @@ -759,10 +760,10 @@ def resize_semantic_history(semantic_history, weight, max_len=256): new_len = int(max_len * weight) semantic_history = semantic_history.astype(np.int64) - # Trim + # Trim if len(semantic_history) > new_len: semantic_history = semantic_history[-new_len:] - # Pad + # Pad else: semantic_history = np.pad( semantic_history, @@ -774,7 +775,6 @@ def resize_semantic_history(semantic_history, weight, max_len=256): return semantic_history - def estimate_spoken_time(text, wpm=150, threshold=15): text_without_brackets = re.sub(r'\[.*?\]', '', text) @@ -784,7 +784,6 @@ def estimate_spoken_time(text, wpm=150, threshold=15): return time_in_seconds - def chunk_up_text(**kwargs): text_prompt = kwargs['text_prompt'] @@ -792,8 +791,8 @@ def chunk_up_text(**kwargs): split_character_max_length = kwargs['split_character_max_length'] silent = kwargs.get('silent') - audio_segments = text_processing.split_general_purpose(text_prompt, split_character_goal_length=split_character_goal_length, split_character_max_length=split_character_max_length) - + audio_segments = text_processing.split_general_purpose( + text_prompt, split_character_goal_length=split_character_goal_length, split_character_max_length=split_character_max_length) split_desc = f"Splitting long 
text aiming for {split_character_goal_length} chars max {split_character_max_length}" @@ -803,7 +802,6 @@ def chunk_up_text(**kwargs): return audio_segments - def chunk_up_text_prev(**kwargs): text_prompt = kwargs['text_prompt'] @@ -811,7 +809,8 @@ def chunk_up_text_prev(**kwargs): split_by_value = kwargs['split_each_text_prompt_by_value'] silent = kwargs.get('silent') - audio_segments = text_processing.split_text(text_prompt, split_by, split_by_value) + audio_segments = text_processing.split_text( + text_prompt, split_by, split_by_value) if split_by == 'phrase': split_desc = f"Splitting long text by *{split_by}* (min_duration=8, max_duration=18, words_per_second=2.3)" @@ -824,7 +823,6 @@ def chunk_up_text_prev(**kwargs): return audio_segments - def print_chunks_table(chunks: list, left_column_header: str = "Words", right_column_header: str = "Segment Text", **kwargs): current_iteration = str( @@ -838,13 +836,12 @@ def print_chunks_table(chunks: list, left_column_header: str = "Words", right_co i = 1 for chunk in chunks: timeest = f"{estimate_spoken_time(chunk):.2f} secs" - table.add_row(str(i), f"{str(len(chunk.split()))}", f"{timeest}", chunk) + table.add_row( + str(i), f"{str(len(chunk.split()))}", f"{timeest}", chunk) i += 1 console.print(table) - - LANG_CODE_DICT = {code: lang for lang, code in SUPPORTED_LANGS} @@ -857,12 +854,14 @@ def gather_speakers(directory): if filename.endswith('.npz'): match = re.match(r"^([a-z]{2})_.*", filename) if match and match.group(1) in LANG_CODE_DICT: - speakers[match.group(1)].append(os.path.join(root, filename)) + speakers[match.group(1)].append( + os.path.join(root, filename)) else: unsupported_files.append(os.path.join(root, filename)) return speakers, unsupported_files + def list_speakers(): all_speakers = defaultdict(list) all_unsupported_files = [] @@ -888,7 +887,3 @@ def print_speakers(speakers, unsupported_files): print("Other:") for file in unsupported_files: print(" " + file) - - - - diff --git a/bark_infinity/bark_legacy/bark_perform.py b/bark_infinity/bark_legacy/bark_perform.py index eec88480..976fc4f7 100644 --- a/bark_infinity/bark_legacy/bark_perform.py +++ b/bark_infinity/bark_legacy/bark_perform.py @@ -10,7 +10,6 @@ FileData = namedtuple("FileData", ["filename", "name", "desc"]) - SUPPORTED_LANGS = [ ("English", "en"), ("German", "de"), @@ -28,44 +27,52 @@ ] - def read_npz_files(directory): return [f for f in os.listdir(directory) if f.endswith(".npz")] + def extract_name_and_desc(filepath): with np.load(filepath) as data: name = data.get('name', '') desc = data.get('desc', '') return name, desc + def categorize_files(files, directory): categorized_files = defaultdict(list) lang_dict = {code: lang for lang, code in SUPPORTED_LANGS} - + for file in files: name, desc = extract_name_and_desc(os.path.join(directory, file)) match = re.match(r"([a-z]{2}|\w+)_", file) if match: prefix = match.group(1) if prefix in lang_dict: - categorized_files[lang_dict[prefix]].append(FileData(file, name, desc)) + categorized_files[lang_dict[prefix]].append( + FileData(file, name, desc)) else: - categorized_files[prefix.capitalize()].append(FileData(file, name, desc)) + categorized_files[prefix.capitalize()].append( + FileData(file, name, desc)) else: categorized_files["Other"].append(FileData(file, name, desc)) return categorized_files # this is a mess but whatever + + def print_speakers_list(categorized_files): print("Available history prompts:") for category, files in categorized_files.items(): - sorted_files = sorted(files, key=lambda x: 
(re.search(r"_\w+(_\d+)?\.npz$", x.filename) and re.search(r"_\w+(_\d+)?\.npz$", x.filename).group()[:-4], x.filename)) + sorted_files = sorted(files, key=lambda x: (re.search(r"_\w+(_\d+)?\.npz$", x.filename) + and re.search(r"_\w+(_\d+)?\.npz$", x.filename).group()[:-4], x.filename)) print(f"\n {category}:") for file_data in sorted_files: name_display = f' "{file_data.name}"' if file_data.name else '' desc_display = f'{file_data.desc}' if file_data.desc else '' - print(f" {file_data.filename[:-4]} {name_display} {desc_display}") + print( + f" {file_data.filename[:-4]} {name_display} {desc_display}") + CUR_PATH = os.path.dirname(os.path.abspath(__file__)) history_prompt_dir = os.path.join(CUR_PATH, "bark", "assets", "prompts") @@ -75,15 +82,14 @@ def print_speakers_list(categorized_files): ALLOWED_PROMPTS = {file[:-4] for file in npz_files} - def estimate_spoken_time(text, wpm=150, time_limit=14): # Remove text within square brackets text_without_brackets = re.sub(r'\[.*?\]', '', text) - + words = text_without_brackets.split() word_count = len(words) time_in_seconds = (word_count / wpm) * 60 - + if time_in_seconds > time_limit: return True, time_in_seconds else: @@ -91,22 +97,28 @@ def estimate_spoken_time(text, wpm=150, time_limit=14): def save_npz_file(filepath, x_semantic_continued, coarse_prompt, fine_prompt, output_dir=None): - np.savez(filepath, semantic_prompt=x_semantic_continued, coarse_prompt=coarse_prompt, fine_prompt=fine_prompt) + np.savez(filepath, semantic_prompt=x_semantic_continued, + coarse_prompt=coarse_prompt, fine_prompt=fine_prompt) print(f"speaker file for this clip saved to {filepath}") + def split_text(text, split_words=0, split_lines=0): if split_words > 0: words = text.split() - chunks = [' '.join(words[i:i + split_words]) for i in range(0, len(words), split_words)] + chunks = [' '.join(words[i:i + split_words]) + for i in range(0, len(words), split_words)] elif split_lines > 0: lines = [line for line in text.split('\n') if line.strip()] - chunks = ['\n'.join(lines[i:i + split_lines]) for i in range(0, len(lines), split_lines)] + chunks = ['\n'.join(lines[i:i + split_lines]) + for i in range(0, len(lines), split_lines)] else: chunks = [text] return chunks + def save_audio_to_file(filepath, audio_array, sample_rate=24000, format='WAV', subtype='PCM_16', output_dir=None): - sf.write(filepath, audio_array, sample_rate, format=format, subtype=subtype) + sf.write(filepath, audio_array, sample_rate, + format=format, subtype=subtype) print(f"Saved audio to {filepath}") @@ -127,9 +139,9 @@ def generate_unique_filename(base_filename): print(f" Using speaker: {history_prompt}") else: print(f" No speaker. 
Randomly generating a speaker.") - + text_chunks = split_text(text_prompt, split_by_words, split_by_lines) - + base = None npzbase = None audio_arr_chunks = [] @@ -138,11 +150,14 @@ def generate_unique_filename(base_filename): for i, chunk in enumerate(text_chunks): print(f"Processing chunk {i + 1}/{len(text_chunks)}: {chunk}") longer_than_14_seconds, estimated_time = estimate_spoken_time(chunk) - print(f"Current text chunk ballpark estimate: {estimated_time:.2f} seconds.") + print( + f"Current text chunk ballpark estimate: {estimated_time:.2f} seconds.") if longer_than_14_seconds: - print(f"Text Prompt could be too long, might want to try a shorter one or try splitting tighter.") + print( + f"Text Prompt could be too long, might want to try a shorter one or try splitting tighter.") - audio_array, x = generate_audio(chunk, history_prompt, text_temp=text_temp, waveform_temp=waveform_temp, base=base, confused_travolta_mode=confused_travolta_mode) + audio_array, x = generate_audio(chunk, history_prompt, text_temp=text_temp, + waveform_temp=waveform_temp, base=base, confused_travolta_mode=confused_travolta_mode) if saveit is True and npzbase is None: npzbase = x if stable_mode: @@ -151,12 +166,13 @@ def generate_unique_filename(base_filename): base = x history_prompt = None audio_arr_chunks.append(audio_array) - + concatenated_audio_arr = np.concatenate(audio_arr_chunks) if not filename: date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H") - truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[:15].strip().replace(" ", "_") + truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[ + :15].strip().replace(" ", "_") filename = f"{truncated_text}-history_prompt-{orig_history_prompt}-text_temp-{text_temp}-waveform_temp-{waveform_temp}-{date_str}.wav" filename = generate_unique_filename(filename) @@ -174,14 +190,15 @@ def generate_unique_filename(base_filename): i += 1 if saveit is True: - save_npz_file(f'{filepath}.npz', npzbase[0], npzbase[1], npzbase[2], output_dir=output_dir) - - save_audio_to_file(filepath, concatenated_audio_arr, SAMPLE_RATE, output_dir=output_dir) + save_npz_file( + f'{filepath}.npz', npzbase[0], npzbase[1], npzbase[2], output_dir=output_dir) + save_audio_to_file(filepath, concatenated_audio_arr, + SAMPLE_RATE, output_dir=output_dir) # If there's no text_prompt passed on the command line, process this list instead. -# If you use an entir song, make sure you set --split_by_lines. +# If you use an entir song, make sure you set --split_by_lines. text_prompts = [] text_prompt = """ @@ -204,7 +221,7 @@ def generate_unique_filename(base_filename): def main(args): - + if args.list_speakers: print_speakers_list(categorized_files) else: @@ -217,15 +234,18 @@ def main(args): else: text_prompts_to_process = [f.read()] - text_prompts_to_process = [prompt for prompt in text_prompts_to_process if prompt.strip()] + text_prompts_to_process = [ + prompt for prompt in text_prompts_to_process if prompt.strip()] print(f"Processing prompts from file: {args.prompt_file}") - print(f"Number of prompts after splitting: {len(text_prompts_to_process)}") + print( + f"Number of prompts after splitting: {len(text_prompts_to_process)}") else: - print("No text prompt provided. Using the prompts defined in this python file instead.") + print( + "No text prompt provided. 
Using the prompts defined in this python file instead.") text_prompts_to_process = text_prompts - if args.history_prompt: + if args.history_prompt: history_prompt = args.history_prompt else: history_prompt = None @@ -237,7 +257,7 @@ def main(args): output_dir = args.output_dir if args.output_dir else "bark_samples" print("Loading Bark models...") - + if args.use_smaller_models: print("Using smaller models.") preload_models(use_smaller_models=True) @@ -247,17 +267,20 @@ def main(args): print("Models loaded.") for idx, prompt in enumerate(text_prompts_to_process, start=1): - print(f"Processing prompt {idx} of {len(text_prompts_to_process)}:") - + print( + f"Processing prompt {idx} of {len(text_prompts_to_process)}:") + split_by_words = args.split_by_words if args.split_by_words else 0 split_by_lines = args.split_by_lines if args.split_by_lines else 0 - if args.iterations > 1: + if args.iterations > 1: for iteration in range(1, args.iterations + 1): print(f"Iteration {iteration} of {args.iterations}.") - gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir, split_by_words, split_by_lines, stable_mode, confused_travolta_mode, iteration=iteration) + gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir, + split_by_words, split_by_lines, stable_mode, confused_travolta_mode, iteration=iteration) else: - gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir, split_by_words, split_by_lines, stable_mode, confused_travolta_mode) + gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, + output_dir, split_by_words, split_by_lines, stable_mode, confused_travolta_mode) if __name__ == "__main__": @@ -291,22 +314,37 @@ def main(args): You'll generally get better results if you manually split your text, which I neglected to provide an easy way to do (seperate token?). """, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.") - parser.add_argument("--history_prompt", default=None, help="Optional. Choose a speaker from the list of languages: . Use --list_speakers to see all available options.") - parser.add_argument("--text_temp", type=float, help="Text temperature. Default is 0.7.") - parser.add_argument("--waveform_temp", type=float, help="Waveform temperature. Default is 0.7.") - parser.add_argument("--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.") - parser.add_argument("--output_dir", help="Output directory. Default is 'bark_samples'.") - parser.add_argument("--list_speakers", action="store_true", help="List all preset speaker options instead of generating audio.") - parser.add_argument("--use_smaller_models", action="store_true", help="Use for GPUS with less than 10GB of memory, or for more speed.") - parser.add_argument("--iterations", type=int, default=1, help="Number of iterations. 
Default is 1.") - parser.add_argument("--split_by_words", type=int, default=0, help="Breaks text_prompt into <14 second audio clips every x words") - parser.add_argument("--split_by_lines", type=int, default=0, help="Breaks text_prompt into <14 second audio clips every x lines") - parser.add_argument("--stable_mode", action="store_true", help="Choppier and not as natural sounding, but much more stable for very long audio files.") - parser.add_argument("--confused_travolta_mode", default=False, action="store_true", help="Just for fun. Try it and you'll understand.") - - parser.add_argument("--prompt_file", help="Optional. The path to a file containing the text prompt. Overrides the --text_prompt option if provided.") - parser.add_argument("--prompt_file_separator", help="Optional. The separator used to split the content of the prompt_file into multiple text prompts.") - + parser.add_argument( + "--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.") + parser.add_argument("--history_prompt", default=None, + help="Optional. Choose a speaker from the list of languages: . Use --list_speakers to see all available options.") + parser.add_argument("--text_temp", type=float, + help="Text temperature. Default is 0.7.") + parser.add_argument("--waveform_temp", type=float, + help="Waveform temperature. Default is 0.7.") + parser.add_argument( + "--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.") + parser.add_argument( + "--output_dir", help="Output directory. Default is 'bark_samples'.") + parser.add_argument("--list_speakers", action="store_true", + help="List all preset speaker options instead of generating audio.") + parser.add_argument("--use_smaller_models", action="store_true", + help="Use for GPUS with less than 10GB of memory, or for more speed.") + parser.add_argument("--iterations", type=int, default=1, + help="Number of iterations. Default is 1.") + parser.add_argument("--split_by_words", type=int, default=0, + help="Breaks text_prompt into <14 second audio clips every x words") + parser.add_argument("--split_by_lines", type=int, default=0, + help="Breaks text_prompt into <14 second audio clips every x lines") + parser.add_argument("--stable_mode", action="store_true", + help="Choppier and not as natural sounding, but much more stable for very long audio files.") + parser.add_argument("--confused_travolta_mode", default=False, + action="store_true", help="Just for fun. Try it and you'll understand.") + + parser.add_argument( + "--prompt_file", help="Optional. The path to a file containing the text prompt. Overrides the --text_prompt option if provided.") + parser.add_argument("--prompt_file_separator", + help="Optional. 
The separator used to split the content of the prompt_file into multiple text prompts.") + args = parser.parse_args() main(args) diff --git a/bark_infinity/bark_legacy/bark_speak.py b/bark_infinity/bark_legacy/bark_speak.py index 3a084836..3a1f7755 100644 --- a/bark_infinity/bark_legacy/bark_speak.py +++ b/bark_infinity/bark_legacy/bark_speak.py @@ -32,19 +32,21 @@ for n in range(10): ALLOWED_PROMPTS.add(f"speaker_{n}") + def estimate_spoken_time(text, wpm=150, time_limit=14): # Remove text within square brackets text_without_brackets = re.sub(r'\[.*?\]', '', text) - + words = text_without_brackets.split() word_count = len(words) time_in_seconds = (word_count / wpm) * 60 - + if time_in_seconds > time_limit: return True, time_in_seconds else: return False, time_in_seconds + def save_audio_to_file(filename, audio_array, sample_rate=24000, format='WAV', subtype='PCM_16', output_dir=None): # Create output directory if it doesn't exist @@ -60,7 +62,8 @@ def save_audio_to_file(filename, audio_array, sample_rate=24000, format='WAV', s filepath = f"{name}_{i}{ext}" i += 1 - sf.write(filepath, audio_array, sample_rate, format=format, subtype=subtype) + sf.write(filepath, audio_array, sample_rate, + format=format, subtype=subtype) print(f"Saved audio to {filepath}") @@ -81,22 +84,22 @@ def generate_unique_filename(base_filename): print(f"Generating: {text_prompt}") if args.history_prompt: print(f"Using speaker: {history_prompt}") - + else: print(f"No speaker. Randomly generating a speaker.") - + audio_array = generate_audio(text_prompt, history_prompt, text_temp=text_temp, - waveform_temp=waveform_temp) + waveform_temp=waveform_temp) if not filename: date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H") - truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[:15].strip().replace(" ", "_") + truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[ + :15].strip().replace(" ", "_") filename = f"{truncated_text}-history_prompt-{history_prompt}-text_temp-{text_temp}-waveform_temp-{waveform_temp}-{date_str}.wav" filename = generate_unique_filename(filename) - save_audio_to_file(filename, audio_array, SAMPLE_RATE, output_dir=output_dir) - - + save_audio_to_file(filename, audio_array, + SAMPLE_RATE, output_dir=output_dir) def print_speakers_list(): @@ -110,7 +113,6 @@ def print_speakers_list(): print(f"\n {language}({lang_code}):\n{speakers}") - # If there's no text_prompt passed on the command line, process this list instead. text_prompts = [] @@ -134,7 +136,7 @@ def main(args): else: print("No text prompt provided. 
Using default prompts defined in this file.") text_prompts_to_process = text_prompts - if args.history_prompt: + if args.history_prompt: history_prompt = args.history_prompt else: history_prompt = None @@ -144,7 +146,7 @@ def main(args): output_dir = args.output_dir if args.output_dir else "bark_samples" print("Loading Bark models...") - + if args.use_smaller_models: print("Using smaller models.") preload_models(use_smaller_models=True) @@ -154,7 +156,9 @@ def main(args): print("Models loaded.") for prompt in text_prompts_to_process: - gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir) + gen_and_save_audio(prompt, history_prompt, text_temp, + waveform_temp, filename, output_dir) + if __name__ == "__main__": parser = argparse.ArgumentParser(description=""" @@ -162,14 +166,22 @@ def main(args): install this first: pip install soundfile Example: python bark_speak.py --text_prompt "It is a mistake to think you can solve any major problems just with potatoes." --history_prompt en_speaker_3 """, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.") - parser.add_argument("--history_prompt", help="Optional. Choose a speaker from the list of languages: " + ", ".join([lang[0] for lang in SUPPORTED_LANGS]) + ". Use --list_speakers to see all available options.") - parser.add_argument("--text_temp", type=float, help="Text temperature. Default is 0.7.") - parser.add_argument("--waveform_temp", type=float, help="Waveform temperature. Default is 0.7.") - parser.add_argument("--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.") - parser.add_argument("--output_dir", help="Output directory. Default is 'bark_samples'.") - parser.add_argument("--list_speakers", action="store_true", help="List all preset speaker options instead of generating audio.") - parser.add_argument("--use_smaller_models", action="store_true", help="Use for GPUS with less than 10GB of memory, or for more speed.") + parser.add_argument( + "--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.") + parser.add_argument("--history_prompt", help="Optional. Choose a speaker from the list of languages: " + + ", ".join([lang[0] for lang in SUPPORTED_LANGS]) + ". Use --list_speakers to see all available options.") + parser.add_argument("--text_temp", type=float, + help="Text temperature. Default is 0.7.") + parser.add_argument("--waveform_temp", type=float, + help="Waveform temperature. Default is 0.7.") + parser.add_argument( + "--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.") + parser.add_argument( + "--output_dir", help="Output directory. 
Default is 'bark_samples'.") + parser.add_argument("--list_speakers", action="store_true", + help="List all preset speaker options instead of generating audio.") + parser.add_argument("--use_smaller_models", action="store_true", + help="Use for GPUS with less than 10GB of memory, or for more speed.") args = parser.parse_args() main(args) diff --git a/bark_infinity/clonevoice.py b/bark_infinity/clonevoice.py index 26ed1e0d..0f777922 100644 --- a/bark_infinity/clonevoice.py +++ b/bark_infinity/clonevoice.py @@ -9,11 +9,13 @@ import gradio import numpy as np -# This file is the work of https://github.com/C0untFloyd/bark-gui/blob/main/webui.py +# This file is the work of https://github.com/C0untFloyd/bark-gui/blob/main/webui.py + + def clone_voice(audio_filepath, text, dest_filename, progress=gradio.Progress(track_tqdm=True)): if len(text) < 1: raise gradio.Error('No transcription text entered!') - + generation.OFFLOAD_CPU = False use_gpu = not os.environ.get("BARK_FORCE_CPU", False) @@ -32,29 +34,31 @@ def clone_voice(audio_filepath, text, dest_filename, progress=gradio.Progress(tr # Extract discrete codes from EnCodec with torch.no_grad(): encoded_frames = model.encode(wav) - codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T] + codes = torch.cat([encoded[0] for encoded in encoded_frames], + dim=-1).squeeze() # [n_q, T] # get seconds of audio seconds = wav.shape[-1] / model.sample_rate # generate semantic tokens - codes_copy = codes.cpu().numpy().copy() - dir_path = api.generate_unique_dirpath(f"voice_clone_samples/{dest_filename}_clones/") + dir_path = api.generate_unique_dirpath( + f"voice_clone_samples/{dest_filename}_clones/") max_gen_duration_s = seconds - for semantic_min_eos_p in [0.05,0.2]: + for semantic_min_eos_p in [0.05, 0.2]: for temp in [0.6, 0.7, 0.8]: - semantic_tokens = generation.generate_text_semantic(text, max_gen_duration_s=max_gen_duration_s, top_k=50, min_eos_p= semantic_min_eos_p, top_p=.95, temp=temp) + semantic_tokens = generation.generate_text_semantic( + text, max_gen_duration_s=max_gen_duration_s, top_k=50, min_eos_p=semantic_min_eos_p, top_p=.95, temp=temp) # move codes to cpu - #codes = codes.cpu().numpy() - + # codes = codes.cpu().numpy() os.makedirs(dir_path, exist_ok=True) output_path = f"{dir_path}/{dest_filename}.npz" output_path = api.generate_unique_filepath(output_path) - full_generation = { "semantic_prompt": semantic_tokens, "coarse_prompt": codes_copy[:2, :], "fine_prompt": codes_copy } + full_generation = {"semantic_prompt": semantic_tokens, + "coarse_prompt": codes_copy[:2, :], "fine_prompt": codes_copy} api.write_seg_npz(output_path, full_generation) print("Well I made a bunch of voices...") diff --git a/bark_infinity/config.py b/bark_infinity/config.py index 35437a0c..0efdc05a 100644 --- a/bark_infinity/config.py +++ b/bark_infinity/config.py @@ -1,3 +1,5 @@ +from rich_argparse import RichHelpFormatter +import argparse import logging from io import StringIO from rich.console import Console @@ -21,119 +23,183 @@ } -VALID_HISTORY_PROMPT_DIRS = ['bark/assets/prompts/', 'bark_infinity/assets/prompts/','custom_speakers/'] +VALID_HISTORY_PROMPT_DIRS = ['bark/assets/prompts/', + 'bark_infinity/assets/prompts/', 'custom_speakers/'] DEFAULTS = { 'input': [ - ('text_prompt', {'value': None, 'type': str, 'help': "Text prompt to generate audio from."}), - ('list_speakers', {'value': None, 'type': bool, 'help': "List available speakers."}), - ('dry_run', {'value': False, 'type': bool, 'help': "Don't generate audio. 
Useful for testing."}), - ('history_prompt', {'value': None, 'type': str, 'help': "Text prompt to generate audio from."}), - ('prompt_file', {'value': None, 'type': str, 'help': "Text prompt to generate audio from."}), - ('split_input_into_separate_prompts_by', {'value': None, 'type': str, 'help': "Split input into separate prompts, each with it's own wav file.", 'choices': CHOICES['split_options']}), - ('split_input_into_separate_prompts_by_value', {'value': None, 'type': str, 'help': "The number of words, lines, sentences, rhymes, alliterations, or the value of the specific string to split your prompts by."}), + ('text_prompt', {'value': None, 'type': str, + 'help': "Text prompt to generate audio from."}), + ('list_speakers', {'value': None, 'type': bool, + 'help': "List available speakers."}), + ('dry_run', {'value': False, 'type': bool, + 'help': "Don't generate audio. Useful for testing."}), + ('history_prompt', {'value': None, 'type': str, + 'help': "Text prompt to generate audio from."}), + ('prompt_file', {'value': None, 'type': str, + 'help': "Text prompt to generate audio from."}), + ('split_input_into_separate_prompts_by', { + 'value': None, 'type': str, 'help': "Split input into separate prompts, each with it's own wav file.", 'choices': CHOICES['split_options']}), + ('split_input_into_separate_prompts_by_value', { + 'value': None, 'type': str, 'help': "The number of words, lines, sentences, rhymes, alliterations, or the value of the specific string to split your prompts by."}), ], - 'output': [ - ('always_save_speaker', {'value': True, 'type': bool, 'help': "Save the speaker.npz files for every generated audio clip. Even history prompts, because the voice will be slightly different after the generation if you save it again."}), - ('output_iterations', {'value': 1, 'type': int, 'help': "Number of audio clips to generate per prompt."}), - ('output_filename', {'value': None, 'type': str, 'help': "Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters."}), - ('output_dir', {'value': 'bark_samples/', 'type': str, 'help': "Output directory."}), + 'output': [ + ('always_save_speaker', {'value': True, 'type': bool, + 'help': "Save the speaker.npz files for every generated audio clip. Even history prompts, because the voice will be slightly different after the generation if you save it again."}), + ('output_iterations', {'value': 1, 'type': int, + 'help': "Number of audio clips to generate per prompt."}), + ('output_filename', {'value': None, 'type': str, + 'help': "Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters."}), + ('output_dir', {'value': 'bark_samples/', + 'type': str, 'help': "Output directory."}), ('hoarder_mode', {'value': False, 'type': bool, 'help': "Who wants to make a cool audio clip and not able to reproduce it in the future? Save it all! 
Creates a sub directory for each clip that is more than one segment long, because it's kind of a lot."}), - ('extra_stats', {'value': False, 'type': bool, 'help': "Extra stats in the filename."}), + ('extra_stats', {'value': False, 'type': bool, + 'help': "Extra stats in the filename."}), ], 'model': [ - ('text_use_gpu', {'value': True, 'type': bool, 'help': "Load the text model on the GPU."}), - ('text_use_small', {'value': False, 'type': bool, 'help': "Use a smaller/faster text model."}), - ('coarse_use_gpu', {'value': True, 'type': bool, 'help': "Load the coarse model on the GPU."}), - ('coarse_use_small', {'value': False, 'type': bool, 'help': "Use a smaller/faster coarse model."}), - ('fine_use_gpu', {'value': True, 'type': bool, 'help': "Load the fine model on the GPU."}), - ('fine_use_small', {'value': False, 'type': bool, 'help': "Use a smaller/faster fine model."}), - ('codec_use_gpu', {'value': True, 'type': bool, 'help': "Load the codec model on the GPU."}), - ('force_reload', {'value': False, 'type': bool, 'help': "Force the models to be downloaded again."}), - - ('GLOBAL_ENABLE_MPS', {'value': None, 'type': bool, 'help': "Apple M1 Hardware Acceleration."}), - - ('USE_SMALL_MODELS', {'value': None, 'type': bool, 'help': "Set OS env for small models."}), - - ('OFFLOAD_CPU', {'value': None, 'type': bool, 'help': "Offload models when not in use, saves a ton of GPU memory and almost as fast."}), + ('text_use_gpu', {'value': True, 'type': bool, + 'help': "Load the text model on the GPU."}), + ('text_use_small', {'value': False, 'type': bool, + 'help': "Use a smaller/faster text model."}), + ('coarse_use_gpu', {'value': True, 'type': bool, + 'help': "Load the coarse model on the GPU."}), + ('coarse_use_small', {'value': False, 'type': bool, + 'help': "Use a smaller/faster coarse model."}), + ('fine_use_gpu', {'value': True, 'type': bool, + 'help': "Load the fine model on the GPU."}), + ('fine_use_small', {'value': False, 'type': bool, + 'help': "Use a smaller/faster fine model."}), + ('codec_use_gpu', {'value': True, 'type': bool, + 'help': "Load the codec model on the GPU."}), + ('force_reload', {'value': False, 'type': bool, + 'help': "Force the models to be downloaded again."}), + + ('GLOBAL_ENABLE_MPS', {'value': None, 'type': bool, + 'help': "Apple M1 Hardware Acceleration."}), + + ('USE_SMALL_MODELS', {'value': None, 'type': bool, + 'help': "Set OS env for small models."}), + + ('OFFLOAD_CPU', {'value': None, 'type': bool, + 'help': "Offload models when not in use, saves a ton of GPU memory and almost as fast."}), ], 'bark_model_parameters': [ - ('text_temp', {'value': 0.7, 'type': float, 'help': "Text temperature. "}), - ('waveform_temp', {'value': 0.7, 'type': float, 'help': "Waveform temperature."}), - ('confused_travolta_mode', {'value': None, 'type': bool, 'help': "Just for fun. Mostly."}), - ('silent', {'value': False, 'type': bool, 'help': "Disable progress bar."}), + ('text_temp', {'value': 0.7, 'type': float, + 'help': "Text temperature. "}), + ('waveform_temp', {'value': 0.7, 'type': float, + 'help': "Waveform temperature."}), + ('confused_travolta_mode', { + 'value': None, 'type': bool, 'help': "Just for fun. Mostly."}), + ('silent', {'value': False, 'type': bool, + 'help': "Disable progress bar."}), ('seed', {'value': None, 'type': int, 'help': "Random seed for a single clip of audio. This sets the seed one time before all three models, but if you have multiple clips, it sets the same seed for every segment. 
You probably want to use --single_starting_seed instead in most cases."}), ], # todo split by one of the options, count by the other. splitting by phrase, and counting by word, is probably pretty good. 'generating_long_clips': [ ('stable_mode_interval', {'value': 1, 'type': int, 'help': "Optional. stable_mode_interval set to 1 means every 14s clip uses the original speaker .npz file, or the first 14s clip of a random voice. 0 means the previous clip is continued. 3 means the speaker history is carried forward 3 times, and then reset back to the original. Not needed at all for short clips. "}), - ('single_starting_seed', {'value': None, 'type': int, 'help': "Random seed that it just set once at the start. This is probalby the seed you want."}), + ('single_starting_seed', {'value': None, 'type': int, + 'help': "Random seed that is just set once at the start. This is probably the seed you want."}), + + ('split_character_goal_length', { + 'value': 110, 'type': int, 'help': "Split your text_prompt into < 14s chunks of about this many characters, general splitter."}), + ('split_character_max_length', { + 'value': 170, 'type': int, 'help': "Split your text_prompt into < 14s, ceiling value."}), + + ('add_silence_between_segments', { + 'value': 0.25, 'type': float, 'help': "Add a bit of silence between joined audio segments. Works well if you are splitting your text on complete sentences or phrases, or if you are using the same prompt every segment (stable_mode_interval = 1). If you are using stable_mode_interval = 0 it might be worse."}), + + ('split_each_text_prompt_by', { + 'value': None, 'type': str, 'help': "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.", 'choices': CHOICES['split_options']}), + ('split_each_text_prompt_by_value', { + 'value': None, 'type': int, 'help': "The number of words, lines, sentences, rhymes, alliterations, or the value of the specific string to split your text_prompt into < 14s chunks."}), + ('extra_confused_travolta_mode', { + 'value': None, 'type': int, 'help': "Like the name says... 1 for more, 2 for way more, the level of confusion now goes to infinity."}), + + + ('semantic_history_starting_weight', { + 'value': 1.0, 'type': float, 'help': ""}), + ('semantic_history_future_weight', { + 'value': 1.0, 'type': float, 'help': ""}), + ('semantic_prev_segment_weight', { + 'value': 0.5, 'type': float, 'help': ""}), + ('coarse_history_starting_weight', { + 'value': 1.0, 'type': float, 'help': ""}), + ('coarse_history_future_weight', { + 'value': 0.5, 'type': float, 'help': ""}), + ('coarse_prev_segment_weight', { + 'value': 0.5, 'type': float, 'help': ""}), + ('fine_history_starting_weight', { + 'value': 1.0, 'type': float, 'help': ""}), + ('fine_history_future_weight', { + 'value': 0.0, 'type': float, 'help': ""}), + ('fine_prev_segment_weight', { + 'value': 0.0, 'type': float, 'help': ""}), + ('custom_audio_processing_function', { + 'value': None, 'type': int, 'help': "Specify a python function callback which determines when and how much of the speaker context to keep or remove or reset. (Not in this version.)"}), - ('split_character_goal_length', {'value': 110, 'type': int, 'help': "Split your text_prompt into < 14s chunks of about many characters, general splitter."}), - ('split_character_max_length', {'value': 170, 'type': int, 'help': "Split your text_prompt into < 14s, ceiling value."}), - ('add_silence_between_segments', {'value': 0.25, 'type': float, 'help': "Add a bit of silence between joined audio segments. 
Works good if you splitting your text on copmlete sentences or phrases, or if you are using the same prompt every segment (stable_mode_interval = 1). If you are using stable_mode_interval = 0 it might be worse."}), - - ('split_each_text_prompt_by', {'value': None, 'type': str, 'help': "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",'choices': CHOICES['split_options']}), - ('split_each_text_prompt_by_value', {'value': None, 'type': int, 'help': "The number of words, lines, sentences, rhymes, alliterations, or the value of the specific string to split your text_prompt into < 14s chunks."}), - ('extra_confused_travolta_mode', {'value': None, 'type': int, 'help': "Like the name says... 1 for more, 2 for way more, the level of confusion now goes to infinity."}), - - - ('semantic_history_starting_weight', {'value': 1.0, 'type': float, 'help': ""}), - ('semantic_history_future_weight', {'value': 1.0, 'type': float, 'help': ""}), - ('semantic_prev_segment_weight', {'value': 0.5, 'type': float, 'help': ""}), - ('coarse_history_starting_weight', {'value': 1.0, 'type': float, 'help': ""}), - ('coarse_history_future_weight', {'value': 0.5, 'type': float, 'help': ""}), - ('coarse_prev_segment_weight', {'value': 0.5, 'type': float, 'help': ""}), - ('fine_history_starting_weight', {'value': 1.0, 'type': float, 'help': ""}), - ('fine_history_future_weight', {'value': 0.0, 'type': float, 'help': ""}), - ('fine_prev_segment_weight', {'value': 0.0, 'type': float, 'help': ""}), - ('custom_audio_processing_function', {'value': None, 'type': int, 'help': "Specify a python function callback which determines when and how much of the speaker context to keep or remove or reset. (Not in this version.)"}), - - ], 'convenience': [ - ('use_smaller_models', {'value': False, 'type': bool, 'help': "Use all small models. Overrides --text_use_small, --coarse_use_small, --fine_use_small. You can probably use big models just fine by default in the latest version though!"}), + ('use_smaller_models', {'value': False, 'type': bool, + 'help': "Use all small models. Overrides --text_use_small, --coarse_use_small, --fine_use_small. You can probably use big models just fine by default in the latest version though!"}), ], 'advanced': [ - ('semantic_temp', {'value': 0.7, 'type': float, 'help': "Temperature for semantic function."}), - ('semantic_top_k', {'value': None, 'type': int, 'help': "Top K for semantic function."}), - ('semantic_top_p', {'value': None, 'type': float, 'help': "Top P for semantic function."}), - ('semantic_min_eos_p', {'value': 0.2, 'type': float, 'help': "Minimum EOS probability for semantic function."}), - ('semantic_max_gen_duration_s', {'value': None, 'type': float, 'help': "Maximum generation duration for semantic function. "}), - ('semantic_allow_early_stop', {'value': True, 'type': bool, 'help': "The secret behind Confused Travolta Mode."}), - ('semantic_use_kv_caching', {'value': True, 'type': bool, 'help': "Use key-value caching. Probably faster with no quality loss."}), - ('semantic_seed', {'value': None, 'type': int, 'help': "Lock semantic seed"}), - ('semantic_history_oversize_limit', {'value': None, 'type': int, 'help': "Maximum size of semantic history, hardcoded to 256. 
Increasing seems terrible but descreasing it may be useful to lower the value and get variations on existing speakers, or try to fine-tune a bit."}), - - ('coarse_temp', {'value': 0.7, 'type': float, 'help': "Temperature for fine function."}), - ('coarse_top_k', {'value': None, 'type': int, 'help': "Top K for coarse function. "}), - ('coarse_top_p', {'value': None, 'type': float, 'help': "Top P for coarse function. "}), - ('coarse_max_coarse_history', {'value': 630, 'type': int, 'help': "Maximum coarse history for coarse function."}), - ('coarse_sliding_window_len', {'value': 60, 'type': int, 'help': "Sliding window length for coarse function."}), - ('coarse_kv_caching', {'value': True, 'type': bool, 'help': "Use key-value caching. Probably faster with no quality loss."}), - ('coarse_seed', {'value': None, 'type': int, 'help': "Lock coarse seed"}), - ('coarse_history_time_alignment_hack', {'value': -2, 'type': int, 'help': "Can try up or down a few notches to see if your audio align better"}), - - ('fine_temp', {'value': 0.5, 'type': float, 'help': "Temperature for fine function."}), + ('semantic_temp', {'value': 0.7, 'type': float, + 'help': "Temperature for semantic function."}), + ('semantic_top_k', {'value': None, 'type': int, + 'help': "Top K for semantic function."}), + ('semantic_top_p', {'value': None, 'type': float, + 'help': "Top P for semantic function."}), + ('semantic_min_eos_p', {'value': 0.2, 'type': float, + 'help': "Minimum EOS probability for semantic function."}), + ('semantic_max_gen_duration_s', { + 'value': None, 'type': float, 'help': "Maximum generation duration for semantic function. "}), + ('semantic_allow_early_stop', { + 'value': True, 'type': bool, 'help': "The secret behind Confused Travolta Mode."}), + ('semantic_use_kv_caching', {'value': True, 'type': bool, + 'help': "Use key-value caching. Probably faster with no quality loss."}), + ('semantic_seed', {'value': None, + 'type': int, 'help': "Lock semantic seed"}), + ('semantic_history_oversize_limit', { + 'value': None, 'type': int, 'help': "Maximum size of semantic history, hardcoded to 256. Increasing seems terrible but descreasing it may be useful to lower the value and get variations on existing speakers, or try to fine-tune a bit."}), + + ('coarse_temp', {'value': 0.7, 'type': float, + 'help': "Temperature for fine function."}), + ('coarse_top_k', {'value': None, 'type': int, + 'help': "Top K for coarse function. "}), + ('coarse_top_p', {'value': None, 'type': float, + 'help': "Top P for coarse function. "}), + ('coarse_max_coarse_history', { + 'value': 630, 'type': int, 'help': "Maximum coarse history for coarse function."}), + ('coarse_sliding_window_len', { + 'value': 60, 'type': int, 'help': "Sliding window length for coarse function."}), + ('coarse_kv_caching', {'value': True, 'type': bool, + 'help': "Use key-value caching. Probably faster with no quality loss."}), + ('coarse_seed', {'value': None, + 'type': int, 'help': "Lock coarse seed"}), + ('coarse_history_time_alignment_hack', { + 'value': -2, 'type': int, 'help': "Can try up or down a few notches to see if your audio align better"}), + + ('fine_temp', {'value': 0.5, 'type': float, + 'help': "Temperature for fine function."}), ('fine_seed', {'value': None, 'type': int, 'help': "Lock fine seed"}), - ('render_npz_samples', {'value': False, 'type': bool, 'help': "Give this a directory of .npz files and it generaates sample audio clips from them."}), - ('loglevel', {'value': 'WARNING', 'type': str, 'help': "Logging level. 
Choices are DEBUG, INFO, WARNING, ERROR, CRITICAL.", 'choices': CHOICES['log_levels']}), + ('render_npz_samples', {'value': False, 'type': bool, + 'help': "Give this a directory of .npz files and it generaates sample audio clips from them."}), + ('loglevel', {'value': 'WARNING', 'type': str, + 'help': "Logging level. Choices are DEBUG, INFO, WARNING, ERROR, CRITICAL.", 'choices': CHOICES['log_levels']}), ], } - - - def get_default_values(group_name): if group_name in DEFAULTS: return {key: value['value'] for key, value in DEFAULTS[group_name]} return {} + def load_all_defaults(**kwargs): for group_name in DEFAULTS: default_values = get_default_values(group_name) @@ -143,17 +209,11 @@ def load_all_defaults(**kwargs): return kwargs - -import argparse -from rich_argparse import RichHelpFormatter - def create_argument_parser(): parser = argparse.ArgumentParser(description=""" Bark is a text-to-speech tool that uses machine learning to synthesize speech from text and other audio sources """, formatter_class=RichHelpFormatter) - - help_tags = { 'input': "Input settings", 'output': "Output settings", @@ -165,7 +225,8 @@ def create_argument_parser(): } for group_name, arguments in DEFAULTS.items(): - group = parser.add_argument_group(group_name, help_tags.get(group_name, "")) + group = parser.add_argument_group( + group_name, help_tags.get(group_name, "")) add_arguments_to_group(group, arguments) return parser @@ -177,12 +238,13 @@ def add_arguments_to_group(group, arguments, help_tag=""): help_text = f"{arg['help']} Default: {arg['value']}" if 'choices' in arg: help_text += f" Choices: {', '.join(map(str, arg['choices']))}" - - #if arg['type'] == bool: - #group.add_argument(f"--{key}", action='store_true', help=help_text) - #else: - - group.add_argument(f"--{key}", type=arg['type'], help=help_text, choices=arg.get('choices')) + + # if arg['type'] == bool: + # group.add_argument(f"--{key}", action='store_true', help=help_text) + # else: + + group.add_argument( + f"--{key}", type=arg['type'], help=help_text, choices=arg.get('choices')) def update_group_args_with_defaults(args): @@ -193,4 +255,4 @@ def update_group_args_with_defaults(args): updated_args[key] = value['value'] else: updated_args[key] = getattr(args, key) - return updated_args \ No newline at end of file + return updated_args diff --git a/bark_infinity/generation.py b/bark_infinity/generation.py index 6dbab2b6..45a1d97b 100644 --- a/bark_infinity/generation.py +++ b/bark_infinity/generation.py @@ -82,13 +82,12 @@ def autocast(): ALLOWED_PROMPTS.add(f"{prefix}{lang}_speaker_{n}") - - CUR_PATH = os.path.dirname(os.path.abspath(__file__)) default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache") -CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") +CACHE_DIR = os.path.join( + os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False) @@ -150,7 +149,8 @@ def _get_ckpt_path(model_type, use_small=False): def _download(from_hf_path, file_name): os.makedirs(CACHE_DIR, exist_ok=True) - hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR) + hf_hub_download(repo_id=from_hf_path, + filename=file_name, local_dir=CACHE_DIR) class InferenceContext: @@ -195,7 +195,6 @@ def clean_models(model_key=None): # def _load_model(ckpt_path, device, use_small=False, model_type="text"): - def _load_codec_model(device): @@ -207,9 +206,6 @@ def _load_codec_model(device): return model - - - def 
load_codec_model(use_gpu=True, force_reload=False): global models global models_devices @@ -228,6 +224,7 @@ def load_codec_model(use_gpu=True, force_reload=False): models[model_key].to(device) return models[model_key] + """ def preload_models( text_use_gpu=True, @@ -273,18 +270,20 @@ def _load_history_prompt(history_prompt_input): if history_prompt_input not in ALLOWED_PROMPTS: raise ValueError("history prompt not found") history_prompt = np.load( - os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt_input}.npz") + os.path.join(CUR_PATH, "assets", "prompts", + f"{history_prompt_input}.npz") ) elif isinstance(history_prompt_input, dict): - assert("semantic_prompt" in history_prompt_input) - assert("coarse_prompt" in history_prompt_input) - assert("fine_prompt" in history_prompt_input) + assert ("semantic_prompt" in history_prompt_input) + assert ("coarse_prompt" in history_prompt_input) + assert ("fine_prompt" in history_prompt_input) history_prompt = history_prompt_input else: raise ValueError("history prompt format unrecognized") return history_prompt # removed semantic_history_oversize_limit because merging + def generate_text_semantic( text, history_prompt=None, @@ -300,7 +299,7 @@ def generate_text_semantic( """Generate semantic tokens from text.""" logger.debug(locals()) - + assert isinstance(text, str) text = _normalize_whitespace(text) assert len(text.strip()) > 0 @@ -375,7 +374,8 @@ def generate_text_semantic( relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE] if allow_early_stop: relevant_logits = torch.hstack( - (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos + (relevant_logits, logits[0, 0, + [SEMANTIC_PAD_TOKEN]]) # eos ) if top_p is not None: # faster to convert to numpy @@ -390,9 +390,11 @@ def generate_text_semantic( sorted_indices_to_remove[0] = False relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf relevant_logits = torch.from_numpy(relevant_logits) - relevant_logits = relevant_logits.to(logits_device).type(logits_dtype) + relevant_logits = relevant_logits.to( + logits_device).type(logits_dtype) if top_k is not None: - v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1))) + v, _ = torch.topk(relevant_logits, min( + top_k, relevant_logits.size(-1))) relevant_logits[relevant_logits < v[-1]] = -float("Inf") probs = F.softmax(relevant_logits / temp, dim=-1) # multinomial bugged on mps: shuttle to cpu if necessary @@ -423,7 +425,7 @@ def generate_text_semantic( pbar.update(req_pbar_state - pbar_state) pbar_state = req_pbar_state pbar.close() - out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :] + out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1:] if OFFLOAD_CPU: model.to("cpu") assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE) @@ -456,7 +458,7 @@ def generate_coarse( sliding_window_len=60, use_kv_caching=False, ): - + logger.debug(locals()) """Generate coarse audio codes from semantic tokens.""" @@ -469,13 +471,15 @@ def generate_coarse( ) assert 60 <= max_coarse_history <= 630 assert max_coarse_history + sliding_window_len <= 1024 - 256 - semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS - max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio)) + semantic_to_coarse_ratio = COARSE_RATE_HZ / \ + SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS + max_semantic_history = int( + np.floor(max_coarse_history / semantic_to_coarse_ratio)) if history_prompt is not None: history_prompt = _load_history_prompt(history_prompt) x_semantic_history = 
history_prompt["semantic_prompt"] x_coarse_history = history_prompt["coarse_prompt"] - + assert ( isinstance(x_semantic_history, np.ndarray) and len(x_semantic_history.shape) == 1 @@ -493,7 +497,8 @@ def generate_coarse( == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1) ) ) - x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE + x_coarse_history = _flatten_codebooks( + x_coarse_history) + SEMANTIC_VOCAB_SIZE # trim histories correctly n_semantic_hist_provided = np.min( @@ -503,9 +508,12 @@ def generate_coarse( int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)), ] ) - n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio)) - x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32) - x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32) + n_coarse_hist_provided = int( + round(n_semantic_hist_provided * semantic_to_coarse_ratio)) + x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype( + np.int32) + x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype( + np.int32) # TODO: bit of a hack for time alignment (sounds better) x_coarse_history = x_coarse_history[:-2] else: @@ -523,7 +531,8 @@ def generate_coarse( # start loop n_steps = int( round( - np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) + np.floor(len(x_semantic) * + semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS ) ) @@ -537,9 +546,11 @@ def generate_coarse( n_window_steps = int(np.ceil(n_steps / sliding_window_len)) n_step = 0 for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent): - semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio)) + semantic_idx = base_semantic_idx + \ + int(round(n_step / semantic_to_coarse_ratio)) # pad from right side - x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :] + x_in = x_semantic_in[:, np.max( + [0, semantic_idx - max_semantic_history]):] x_in = x_in[:, :256] x_in = F.pad( x_in, @@ -565,12 +576,15 @@ def generate_coarse( else: x_input = x_in - logits, kv_cache = model(x_input, use_cache=use_kv_caching, past_kv=kv_cache) + logits, kv_cache = model( + x_input, use_cache=use_kv_caching, past_kv=kv_cache) logit_start_idx = ( - SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE + SEMANTIC_VOCAB_SIZE + + (1 - int(is_major_step)) * CODEBOOK_SIZE ) logit_end_idx = ( - SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE + SEMANTIC_VOCAB_SIZE + + (2 - int(is_major_step)) * CODEBOOK_SIZE ) relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx] if top_p is not None: @@ -586,9 +600,11 @@ def generate_coarse( sorted_indices_to_remove[0] = False relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf relevant_logits = torch.from_numpy(relevant_logits) - relevant_logits = relevant_logits.to(logits_device).type(logits_dtype) + relevant_logits = relevant_logits.to( + logits_device).type(logits_dtype) if top_k is not None: - v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1))) + v, _ = torch.topk(relevant_logits, min( + top_k, relevant_logits.size(-1))) relevant_logits[relevant_logits < v[-1]] = -float("Inf") probs = F.softmax(relevant_logits / temp, dim=-1) # multinomial bugged on mps: shuttle to cpu if necessary @@ -607,10 +623,12 @@ def generate_coarse( del x_semantic_in if OFFLOAD_CPU: model.to("cpu") - gen_coarse_arr = 
x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :] + gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[ + len(x_coarse_history):] del x_coarse_in assert len(gen_coarse_arr) == n_steps - gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE + gen_coarse_audio_arr = gen_coarse_arr.reshape( + -1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE for n in range(1, N_COARSE_CODEBOOKS): gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE _clear_cuda_cache() @@ -625,7 +643,6 @@ def generate_fine( ): logger.debug(locals()) - """Generate full audio codes from coarse audio codes.""" assert ( isinstance(x_coarse_gen, np.ndarray) @@ -685,22 +702,26 @@ def generate_fine( in_arr = np.hstack( [ in_arr, - np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32) + CODEBOOK_SIZE, + np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), + dtype=np.int32) + CODEBOOK_SIZE, ] ) # we can be lazy about fractional loop and just keep overwriting codebooks - n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1 + n_loops = np.max( + [0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1 with _inference_mode(): in_arr = torch.tensor(in_arr.T).to(device) for n in tqdm.tqdm(range(n_loops), disable=silent): start_idx = np.min([n * 512, in_arr.shape[0] - 1024]) - start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512]) + start_fill_idx = np.min( + [n_history + n * 512, in_arr.shape[0] - 512]) rel_start_fill_idx = start_fill_idx - start_idx - in_buffer = in_arr[start_idx : start_idx + 1024, :][None] + in_buffer = in_arr[start_idx: start_idx + 1024, :][None] for nn in range(n_coarse, N_FINE_CODEBOOKS): logits = model(nn, in_buffer) if temp is None: - relevant_logits = logits[0, rel_start_fill_idx:, :CODEBOOK_SIZE] + relevant_logits = logits[0, + rel_start_fill_idx:, :CODEBOOK_SIZE] codebook_preds = torch.argmax(relevant_logits, -1) else: relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp @@ -711,7 +732,8 @@ def generate_fine( probs = probs.to("cpu") codebook_preds = torch.hstack( [ - torch.multinomial(probs[nnn], num_samples=1).to(inf_device) + torch.multinomial( + probs[nnn], num_samples=1).to(inf_device) for nnn in range(rel_start_fill_idx, 1024) ] ) @@ -720,7 +742,7 @@ def generate_fine( # transfer over info into model_in and convert to numpy for nn in range(n_coarse, N_FINE_CODEBOOKS): in_arr[ - start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn + start_fill_idx: start_fill_idx + (1024 - rel_start_fill_idx), nn ] = in_buffer[0, rel_start_fill_idx:, nn] del in_buffer gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T @@ -758,14 +780,15 @@ def codec_decode(fine_tokens): return audio_arr -## Added: +# Added: # Just overriding this because somehow I keep loading the wrong models? 
def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="text"): logger.debug(locals()) - _load_model_f = funcy.partial(_load_model, model_type=model_type, use_small=use_small) + _load_model_f = funcy.partial( + _load_model, model_type=model_type, use_small=use_small) if model_type not in ("text", "coarse", "fine"): raise NotImplementedError() global models @@ -803,13 +826,17 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"): model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type model_info = REMOTE_MODEL_PATHS[model_key] if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.") - remote_filename = hf_hub_url(model_info["repo_id"], model_info["file_name"]) - ## added, actually screw logging, just print, rest easy always knowing which model is loaded - print(f"Downloading {model_key} {model_info['repo_id']} remote model file {remote_filename} {model_info['file_name']} to {CACHE_DIR}") # added + logger.info( + f"{model_type} model not found, downloading into `{CACHE_DIR}`.") + remote_filename = hf_hub_url( + model_info["repo_id"], model_info["file_name"]) + # added, actually screw logging, just print, rest easy always knowing which model is loaded + # added + print( + f"Downloading {model_key} {model_info['repo_id']} remote model file {remote_filename} {model_info['file_name']} to {CACHE_DIR}") _download(model_info["repo_id"], model_info["file_name"]) - ## added - print(f"Loading {model_key} model from {ckpt_path} to {device}") # added + # added + print(f"Loading {model_key} model from {ckpt_path} to {device}") # added checkpoint = torch.load(ckpt_path, map_location=device) # this is a hack @@ -825,11 +852,12 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"): unwanted_prefix = "_orig_mod." 
for k, v in list(state_dict.items()): if k.startswith(unwanted_prefix): - state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k) + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")]) missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")]) + missing_keys = set( + [k for k in missing_keys if not k.endswith(".attn.bias")]) if len(extra_keys) != 0: raise ValueError(f"extra keys found: {extra_keys}") if len(missing_keys) != 0: @@ -837,13 +865,15 @@ def _load_model(ckpt_path, device, use_small=False, model_type="text"): model.load_state_dict(state_dict, strict=False) n_params = model.get_num_params() val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") + logger.info( + f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") model.eval() model.to(device) del checkpoint, state_dict _clear_cuda_cache() if model_type == "text": - tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased") + tokenizer = BertTokenizer.from_pretrained( + "bert-base-multilingual-cased") return { "model": model, "tokenizer": tokenizer, @@ -863,11 +893,10 @@ def preload_models( ): """Load all the necessary models for the pipeline.""" - - # What is going on here - logger.debug(f"USE_SMALL_MODELS = {USE_SMALL_MODELS} GLOBAL_ENABLE_MPS = {GLOBAL_ENABLE_MPS}, OFFLOAD_CPU = {OFFLOAD_CPU}") - logger.debug(f"text_use_gpu = {text_use_gpu}, text_use_small = {text_use_small}, coarse_use_gpu = {coarse_use_gpu}, coarse_use_small = {coarse_use_small}, fine_use_gpu = {fine_use_gpu}, fine_use_small = {fine_use_small}, codec_use_gpu = {codec_use_gpu}, force_reload = {force_reload}") + logger.debug( + f"USE_SMALL_MODELS = {USE_SMALL_MODELS} GLOBAL_ENABLE_MPS = {GLOBAL_ENABLE_MPS}, OFFLOAD_CPU = {OFFLOAD_CPU}") + logger.debug(f"text_use_gpu = {text_use_gpu}, text_use_small = {text_use_small}, coarse_use_gpu = {coarse_use_gpu}, coarse_use_small = {coarse_use_small}, fine_use_gpu = {fine_use_gpu}, fine_use_small = {fine_use_small}, codec_use_gpu = {codec_use_gpu}, force_reload = {force_reload}") # Is this actually bugged in Bark main, not my fault? This is checked further down the stack, but the chkpt_path is not updated in places @@ -877,11 +906,12 @@ def preload_models( text_use_small = True coarse_use_small = True fine_use_small = True - + if _grab_best_device() == "cpu" and ( text_use_gpu or coarse_use_gpu or fine_use_gpu or codec_use_gpu ): - logger.warning("No GPU being used. Careful, inference might be very slow!") + logger.warning( + "No GPU being used. 
Careful, inference might be very slow!") _ = load_model( model_type="text", use_gpu=text_use_gpu, use_small=text_use_small, force_reload=force_reload ) @@ -933,7 +963,7 @@ def set_seed(seed: int = 0): # Use default_rng() because it is independent of np.random.seed() seed = np.random.default_rng().integers(1, 2**32 - 1) - assert(0 < seed and seed < 2**32) + assert (0 < seed and seed < 2**32) np.random.seed(seed) random.seed(seed) @@ -944,5 +974,3 @@ def set_seed(seed: int = 0): logger.info(f"Set seed to {seed}") return original_seed if original_seed != 0 else seed - - diff --git a/bark_infinity/model.py b/bark_infinity/model.py index 457b49e7..3e8bb57e 100644 --- a/bark_infinity/model.py +++ b/bark_infinity/model.py @@ -9,6 +9,7 @@ import torch.nn as nn from torch.nn import functional as F + class LayerNorm(nn.Module): """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """ @@ -20,13 +21,15 @@ def __init__(self, ndim, bias): def forward(self, input): return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5) + class CausalSelfAttention(nn.Module): def __init__(self, config): super().__init__() assert config.n_embd % config.n_head == 0 # key, query, value projections for all heads, but in a batch - self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias) + self.c_attn = nn.Linear( + config.n_embd, 3 * config.n_embd, bias=config.bias) # output projection self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) # regularization @@ -36,21 +39,25 @@ def __init__(self, config): self.n_embd = config.n_embd self.dropout = config.dropout # flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary - self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') + self.flash = hasattr(torch.nn.functional, + 'scaled_dot_product_attention') if not self.flash: # print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0") # causal mask to ensure that attention is only applied to the left in the input sequence self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)) - .view(1, 1, config.block_size, config.block_size)) + .view(1, 1, config.block_size, config.block_size)) def forward(self, x, past_kv=None, use_cache=False): - B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) # calculate query, key, values for all heads in batch and move head forward to be the batch dim - q, k ,v = self.c_attn(x).split(self.n_embd, dim=2) - k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + q, k, v = self.c_attn(x).split(self.n_embd, dim=2) + k = k.view(B, T, self.n_head, C // + self.n_head).transpose(1, 2) # (B, nh, T, hs) + q = q.view(B, T, self.n_head, C // + self.n_head).transpose(1, 2) # (B, nh, T, hs) + v = v.view(B, T, self.n_head, C // + self.n_head).transpose(1, 2) # (B, nh, T, hs) if past_kv is not None: past_key = past_kv[0] @@ -71,32 +78,38 @@ def forward(self, x, past_kv=None, use_cache=False): if past_kv is not None: # When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains # the query for the last token. 
scaled_dot_product_attention interprets this as the first token in the - # sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so + # sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so # to work around this we set is_causal=False. is_causal = False else: is_causal = True - y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal) + y = torch.nn.functional.scaled_dot_product_attention( + q, k, v, dropout_p=self.dropout, is_causal=is_causal) else: # manual implementation of attention att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) - att = att.masked_fill(self.bias[:,:,FULL_T-T:FULL_T,:FULL_T] == 0, float('-inf')) + att = att.masked_fill( + self.bias[:, :, FULL_T-T:FULL_T, :FULL_T] == 0, float('-inf')) att = F.softmax(att, dim=-1) att = self.attn_dropout(att) - y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) - y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side + y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) + # re-assemble all head outputs side by side + y = y.transpose(1, 2).contiguous().view(B, T, C) # output projection y = self.resid_dropout(self.c_proj(y)) return (y, present) + class MLP(nn.Module): def __init__(self, config): super().__init__() - self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias) - self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias) + self.c_fc = nn.Linear( + config.n_embd, 4 * config.n_embd, bias=config.bias) + self.c_proj = nn.Linear( + 4 * config.n_embd, config.n_embd, bias=config.bias) self.dropout = nn.Dropout(config.dropout) self.gelu = nn.GELU() @@ -107,6 +120,7 @@ def forward(self, x): x = self.dropout(x) return x + class Block(nn.Module): def __init__(self, config, layer_idx): @@ -118,11 +132,13 @@ def __init__(self, config, layer_idx): self.layer_idx = layer_idx def forward(self, x, past_kv=None, use_cache=False): - attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache) + attn_output, prev_kvs = self.attn( + self.ln_1(x), past_kv=past_kv, use_cache=use_cache) x = x + attn_output x = x + self.mlp(self.ln_2(x)) return (x, prev_kvs) + @dataclass class GPTConfig: block_size: int = 1024 @@ -132,7 +148,9 @@ class GPTConfig: n_head: int = 12 n_embd: int = 768 dropout: float = 0.0 - bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + bias: bool = True + class GPT(nn.Module): @@ -144,13 +162,15 @@ def __init__(self, config): self.config = config self.transformer = nn.ModuleDict(dict( - wte = nn.Embedding(config.input_vocab_size, config.n_embd), - wpe = nn.Embedding(config.block_size, config.n_embd), - drop = nn.Dropout(config.dropout), - h = nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]), - ln_f = LayerNorm(config.n_embd, bias=config.bias), + wte=nn.Embedding(config.input_vocab_size, config.n_embd), + wpe=nn.Embedding(config.block_size, config.n_embd), + drop=nn.Dropout(config.dropout), + h=nn.ModuleList([Block(config, idx) + for idx in range(config.n_layer)]), + ln_f=LayerNorm(config.n_embd, bias=config.bias), )) - self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False) + self.lm_head = nn.Linear( + config.n_embd, config.output_vocab_size, bias=False) def get_num_params(self, non_embedding=True): """ @@ -170,10 +190,11 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use b, t = idx.size() if past_kv is not None: assert t == 1 - tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + # token embeddings of shape (b, t, n_embd) + tok_emb = self.transformer.wte(idx) else: if merge_context: - assert(idx.shape[1] >= 256+256+1) + assert (idx.shape[1] >= 256+256+1) t = idx.shape[1] - 256 else: assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" @@ -181,11 +202,13 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use # forward the GPT model itself if merge_context: tok_emb = torch.cat([ - self.transformer.wte(idx[:,:256]) + self.transformer.wte(idx[:,256:256+256]), - self.transformer.wte(idx[:,256+256:]) + self.transformer.wte( + idx[:, :256]) + self.transformer.wte(idx[:, 256:256+256]), + self.transformer.wte(idx[:, 256+256:]) ], dim=1) else: - tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + # token embeddings of shape (b, t, n_embd) + tok_emb = self.transformer.wte(idx) if past_kv is None: past_length = 0 @@ -194,11 +217,13 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use past_length = past_kv[0][0].size(-2) if position_ids is None: - position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) # shape (1, t) + position_ids = torch.arange( + past_length, t + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) # shape (1, t) assert position_ids.shape == (1, t) - pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd) + # position embeddings of shape (1, t, n_embd) + pos_emb = self.transformer.wpe(position_ids) x = self.transformer.drop(tok_emb + pos_emb) @@ -213,6 +238,7 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use x = self.transformer.ln_f(x) # inference-time mini-optimization: only forward the lm_head on the very last position - logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim + # note: using list [-1] to preserve the time dim + logits = self.lm_head(x[:, [-1], :]) return (logits, new_kv) diff --git a/bark_infinity/model_fine.py b/bark_infinity/model_fine.py index 6179a851..e92f7951 100644 --- a/bark_infinity/model_fine.py +++ b/bark_infinity/model_fine.py @@ -17,7 +17,8 @@ def __init__(self, config): 
super().__init__() assert config.n_embd % config.n_head == 0 # key, query, value projections for all heads, but in a batch - self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias) + self.c_attn = nn.Linear( + config.n_embd, 3 * config.n_embd, bias=config.bias) # output projection self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) # regularization @@ -28,7 +29,8 @@ def __init__(self, config): self.dropout = config.dropout # flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary self.flash = ( - hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0 + hasattr(torch.nn.functional, + "scaled_dot_product_attention") and self.dropout == 0.0 ) def forward(self, x): @@ -36,9 +38,12 @@ def forward(self, x): # calculate query, key, values for all heads in batch and move head forward to be the batch dim q, k, v = self.c_attn(x).split(self.n_embd, dim=2) - k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) - v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + k = k.view(B, T, self.n_head, C // + self.n_head).transpose(1, 2) # (B, nh, T, hs) + q = q.view(B, T, self.n_head, C // + self.n_head).transpose(1, 2) # (B, nh, T, hs) + v = v.view(B, T, self.n_head, C // + self.n_head).transpose(1, 2) # (B, nh, T, hs) # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) if self.flash: @@ -91,7 +96,8 @@ def __init__(self, config): ), wpe=nn.Embedding(config.block_size, config.n_embd), drop=nn.Dropout(config.dropout), - h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]), + h=nn.ModuleList([FineBlock(config) + for _ in range(config.n_layer)]), ln_f=nn.LayerNorm(config.n_embd), ) ) @@ -112,14 +118,16 @@ def forward(self, pred_idx, idx): ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" assert pred_idx > 0, "cannot predict 0th codebook" assert codes == self.n_codes_total, (b, t, codes) - pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t) + pos = torch.arange(0, t, dtype=torch.long, + device=device).unsqueeze(0) # shape (1, t) # forward the GPT model itself tok_embs = [ wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes) ] # token embeddings of shape (b, t, n_embd) tok_emb = torch.cat(tok_embs, dim=-1) - pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd) + # position embeddings of shape (1, t, n_embd) + pos_emb = self.transformer.wpe(pos) x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1) x = self.transformer.drop(x + pos_emb) for block in self.transformer.h: diff --git a/bark_perform.py b/bark_perform.py index fa4523a2..f4614040 100644 --- a/bark_perform.py +++ b/bark_perform.py @@ -1,3 +1,6 @@ +from bark_infinity import text_processing +from bark_infinity import api +from bark_infinity import generation import argparse import numpy as np @@ -7,11 +10,6 @@ logger = config.logger -from bark_infinity import generation -from bark_infinity import api - -from bark_infinity import text_processing - generation.OFFLOAD_CPU = True generation.USE_SMALL_MODELS = False @@ -28,6 +26,7 @@ """ text_prompts.append(text_prompt) + def get_group_args(group_name, updated_args): # Convert the Namespace object to a dictionary updated_args_dict = vars(updated_args) @@ -38,13 +37,12 @@ def 
get_group_args(group_name, updated_args): group_args[key] = value return group_args + def main(args): if args.loglevel is not None: logger.setLevel(args.loglevel) - - if args.list_speakers: api.list_speakers() return @@ -75,8 +73,6 @@ def main(args): print( f"WARNING: You are about to process {things} prompts. Consider using '--dry-run' to test things first.") - - """ def preload_models( text_use_gpu=True, @@ -89,15 +85,14 @@ def preload_models( force_reload=False, ): """ - #pprint(args) + # pprint(args) print("Loading Bark models...") if not args.dry_run: - generation.preload_models(args.text_use_gpu, args.text_use_small, args.coarse_use_gpu, args.coarse_use_small, args.fine_use_gpu, args.fine_use_small, args.codec_use_gpu, args.force_reload) + generation.preload_models(args.text_use_gpu, args.text_use_small, args.coarse_use_gpu, args.coarse_use_small, + args.fine_use_gpu, args.fine_use_small, args.codec_use_gpu, args.force_reload) print("Done.") - - for idx, text_prompt in enumerate(text_prompts_to_process, start=1): if len(text_prompts_to_process) > 1: print(f"\nPrompt {idx}/{len(text_prompts_to_process)}:") diff --git a/bark_webui.py b/bark_webui.py index 3251c325..5e5a7c9f 100644 --- a/bark_webui.py +++ b/bark_webui.py @@ -1,3 +1,11 @@ +import functools +import time +import threading +from bark_infinity.clonevoice import clone_voice +from bark_infinity import api +from bark_infinity import generation +from bark_infinity import config +import io import datetime import os import random @@ -8,15 +16,10 @@ from collections import defaultdict from tqdm import tqdm os.environ["TERM"] = "dumb" -import io -from bark_infinity import config logger = config.logger logger.setLevel("INFO") -from bark_infinity import generation -from bark_infinity import api - generation.OFFLOAD_CPU = True generation.USE_SMALL_MODELS = False @@ -28,12 +31,9 @@ glass_theme = gr.themes.Glass() -# If anyone is looking at this code, I just took gradio blocks kitchen sink demo and cut and pasted all over the place, and as usual I know realize I should have just carefully read the Gradio explanation from the beginning. +# If anyone is looking at this code, I just took gradio blocks kitchen sink demo and cut and pasted all over the place, and as usual I know realize I should have just carefully read the Gradio explanation from the beginning. 
-from bark_infinity.clonevoice import clone_voice -import threading -import time cancel_process = False @@ -63,14 +63,15 @@ overflow-x: hidden; } """ -import functools + def timeout(seconds): def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): result = [None] - thread = threading.Thread(target=lambda: result.__setitem__(0, func(*args, **kwargs))) + thread = threading.Thread( + target=lambda: result.__setitem__(0, func(*args, **kwargs))) thread.start() thread.join(seconds) if thread.is_alive(): @@ -79,6 +80,7 @@ def wrapper(*args, **kwargs): return wrapper return decorator + @timeout(1) # Adjust the timeout value according to your needs def cancellable_generate_audio_long_gradio(*args, **kwargs): global cancel_process @@ -93,6 +95,7 @@ def cancellable_generate_audio_long_gradio(*args, **kwargs): print("Process canceled!") return result + def cancel(): global cancel_process cancel_process = True @@ -101,11 +104,15 @@ def cancel(): if len(sys.argv) > 1: autolaunch = "-autolaunch" in sys.argv + def start_long_running_function_thread(*args, **kwargs): - thread = threading.Thread(target=cancellable_generate_audio_long_gradio, args=args, kwargs=kwargs) + thread = threading.Thread( + target=cancellable_generate_audio_long_gradio, args=args, kwargs=kwargs) thread.start() # I made a CLI app. This is my solution. I'm not proud of it. + + def parse_extra_args(extra_args_str): extra_args = extra_args_str.split('--') parsed_args = {} @@ -128,25 +135,28 @@ def parse_extra_args(extra_args_str): parsed_args[key] = value return parsed_args -def generate_audio_long_gradio(input, npz_dropdown, generated_voices, confused_travolta_mode, stable_mode_interval, split_character_goal_length, split_character_max_length, seed, dry_run,output_iterations,hoarder_mode, text_temp, waveform_temp, semantic_min_eos_p, output_dir, add_silence_between_segments, extra_args_str, progress=gr.Progress(track_tqdm=True)): + +def generate_audio_long_gradio(input, npz_dropdown, generated_voices, confused_travolta_mode, stable_mode_interval, split_character_goal_length, split_character_max_length, seed, dry_run, output_iterations, hoarder_mode, text_temp, waveform_temp, semantic_min_eos_p, output_dir, add_silence_between_segments, extra_args_str, progress=gr.Progress(track_tqdm=True)): print("\n") if input == None or len(input) < 4: print("\nLooks like you forgot to enter a text prompt.") raise gr.Error('Looks like you forgot to enter a text prompt.') - - #print(locals()) + + # print(locals()) kwargs = {} kwargs["text_prompt"] = input - + # I must have screwed up why are these values so messed up if npz_dropdown != '' and npz_dropdown is not None: - if len(npz_dropdown.strip()) > 6: kwargs["history_prompt"] = npz_dropdown + if len(npz_dropdown.strip()) > 6: + kwargs["history_prompt"] = npz_dropdown if generated_voices != '' and generated_voices is not None: - if len(generated_voices.strip()) > 6: kwargs["history_prompt"] = generated_voices + if len(generated_voices.strip()) > 6: + kwargs["history_prompt"] = generated_voices kwargs["confused_travolta_mode"] = confused_travolta_mode kwargs["split_character_goal_length"] = int(split_character_goal_length) kwargs["split_character_max_length"] = int(split_character_max_length) - + if seed != '' and seed is not None: kwargs["single_starting_seed"] = int(seed) @@ -168,14 +178,11 @@ def generate_audio_long_gradio(input, npz_dropdown, generated_voices, confused_t if output_iterations is not None and output_iterations != '': kwargs["output_iterations"] = 
int(output_iterations) - if output_dir is not None and output_dir != '': kwargs["output_dir"] = output_dir - #this is obviously got to be the wrong way to do this + # this is obviously got to be the wrong way to do this - - if text_temp is not None and text_temp != '': kwargs["text_temp"] = float(text_temp) @@ -186,46 +193,44 @@ def generate_audio_long_gradio(input, npz_dropdown, generated_voices, confused_t kwargs["semantic_min_eos_p"] = float(semantic_min_eos_p) if add_silence_between_segments is not None and add_silence_between_segments != '': - kwargs["add_silence_between_segments"] = float(add_silence_between_segments) - - - - - + kwargs["add_silence_between_segments"] = float( + add_silence_between_segments) # i need to read the docs kwargs.update(parse_extra_args(extra_args_str)) - using_these_params = kwargs.copy() using_these_params["text_prompt"] = "..." print(f"Using these params: {using_these_params}") - - full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long_from_gradio(**kwargs) + full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long_from_gradio( + **kwargs) if kwargs.get('dry_run', False): final_filename_will_be = "bark_infinity/assets/split_the_text.wav" return final_filename_will_be + def create_npz_dropdown(directories, label): npz_files_by_subfolder = defaultdict(list) - + for directory in directories: for npz_file in glob.glob(os.path.join(directory, '**', '*.npz'), recursive=True): subfolder = os.path.dirname(npz_file) npz_files_by_subfolder[subfolder].append(npz_file) - + sorted_npz_files = [] for subfolder in sorted(npz_files_by_subfolder.keys()): sorted_npz_files.extend(sorted(npz_files_by_subfolder[subfolder])) - + npz_dropdown = gr.Dropdown(sorted_npz_files, label=label) return npz_dropdown + directories = ["custom_speakers/", "bark/assets/prompts/"] outputs_dirs = ["bark_samples/"] + class Logger: def __init__(self, filename): self.terminal = sys.stdout @@ -243,34 +248,48 @@ def isatty(self): return False -sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace', newline='', line_buffering=True) -sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace', newline='', line_buffering=True) +sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, encoding='utf-8', errors='replace', newline='', line_buffering=True) +sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, encoding='utf-8', errors='replace', newline='', line_buffering=True) sys.stdout = Logger("gradio_terminal_ouput.log") + + def test(x): - #print("This is a test") - #print(f"Your function is running with input {x}...") + # print("This is a test") + # print(f"Your function is running with input {x}...") return + def read_logs(): sys.stdout.flush() with open("gradio_terminal_ouput.log", "r", encoding="utf-8") as f: return f.read() - + model_options = [ - ('text_use_gpu', {'value': True, 'type': bool, 'help': "Load the text model on the GPU."}), - ('text_use_small', {'value': False, 'type': bool, 'help': "Use a smaller/faster text model."}), - ('coarse_use_gpu', {'value': True, 'type': bool, 'help': "Load the coarse model on the GPU."}), - ('coarse_use_small', {'value': False, 'type': bool, 'help': "Use a smaller/faster coarse model."}), - ('fine_use_gpu', {'value': True, 'type': bool, 'help': "Load the fine model on the GPU."}), - ('fine_use_small', {'value': False, 'type': bool, 'help': "Use a smaller/faster fine model."}), - ('codec_use_gpu', {'value': True, 'type': bool, 
'help': "Load the codec model on the GPU."}), - ('force_reload', {'value': False, 'type': bool, 'help': "Force the models to be downloaded again."}), + ('text_use_gpu', {'value': True, 'type': bool, + 'help': "Load the text model on the GPU."}), + ('text_use_small', {'value': False, 'type': bool, + 'help': "Use a smaller/faster text model."}), + ('coarse_use_gpu', {'value': True, 'type': bool, + 'help': "Load the coarse model on the GPU."}), + ('coarse_use_small', {'value': False, 'type': bool, + 'help': "Use a smaller/faster coarse model."}), + ('fine_use_gpu', {'value': True, 'type': bool, + 'help': "Load the fine model on the GPU."}), + ('fine_use_small', {'value': False, 'type': bool, + 'help': "Use a smaller/faster fine model."}), + ('codec_use_gpu', {'value': True, 'type': bool, + 'help': "Load the codec model on the GPU."}), + ('force_reload', {'value': False, 'type': bool, + 'help': "Force the models to be downloaded again."}), ] + def preload_models_gradio(text_use_gpu, text_use_small, coarse_use_gpu, coarse_use_small, fine_use_gpu, fine_use_small, codec_use_gpu, force_reload): print("Preloading models...") generation.preload_models( @@ -284,33 +303,37 @@ def preload_models_gradio(text_use_gpu, text_use_small, coarse_use_gpu, coarse_u force_reload=force_reload, ) -def generate_speaker_variations(variation_path, variation_count): +def generate_speaker_variations(variation_path, variation_count): # I need to actually read how Gradio is supposed to work... why is this a float? if variation_count is not None and variation_count != '': variation_count = int(variation_count) print(f"Generating {variation_count} for speakers {variation_path}...") - api.render_npz_samples(npz_directory=variation_path,gen_minor_variants=variation_count) + api.render_npz_samples(npz_directory=variation_path, + gen_minor_variants=variation_count) return + def generate_sample_audio(sample_gen_path): print("Generating sample audio...") api.render_npz_samples(npz_directory=sample_gen_path) return + def sent_bark_envs(env_config_group): OFFLOAD_CPU = "OFFLOAD_CPU" in env_config_group USE_SMALL_MODELS = "USE_SMALL_MODELS" in env_config_group GLOBAL_ENABLE_MPS = "GLOBAL_ENABLE_MPS" in env_config_group - - print(f"Setting these envs: OFFLOAD_CPU={OFFLOAD_CPU}, USE_SMALL_MODELS={USE_SMALL_MODELS}, GLOBAL_ENABLE_MPS={GLOBAL_ENABLE_MPS}") + print( + f"Setting these envs: OFFLOAD_CPU={OFFLOAD_CPU}, USE_SMALL_MODELS={USE_SMALL_MODELS}, GLOBAL_ENABLE_MPS={GLOBAL_ENABLE_MPS}") generation.OFFLOAD_CPU = OFFLOAD_CPU generation.USE_SMALL_MODELS = USE_SMALL_MODELS generation.GLOBAL_ENABLE_MPS = GLOBAL_ENABLE_MPS + def set_loglevel(loglevel): if loglevel is not None and loglevel != '': @@ -318,16 +341,16 @@ def set_loglevel(loglevel): logger.setLevel(loglevel) - - - def generate_gradio_widgets(options): widgets = [] for option_name, option_info in options: if option_info['type'] == bool: - checkbox = gr.Checkbox(label=option_name, value=option_info['value'], info=option_info['help']) + checkbox = gr.Checkbox( + label=option_name, value=option_info['value'], info=option_info['help']) widgets.append(checkbox) return widgets + + generated_widgets = generate_gradio_widgets(model_options) @@ -345,9 +368,10 @@ def format_defaults(defaults): formatted_text += "\n" return formatted_text + formatted_defaults = format_defaults(config.DEFAULTS) -with gr.Blocks(theme=default_theme,css=bark_console_style) as demo: +with gr.Blocks(theme=default_theme, css=bark_console_style) as demo: gr.Markdown( """ # Bark Infinity "Command Line" @@ -356,80 
+380,81 @@ def format_defaults(defaults): """ ) - with gr.Row(): with gr.Column(variant="panel", scale=0.5): gr.Markdown("""### 🐶 Main Bark Input""") - input = gr.TextArea(placeholder="Text Prompt", label="Text your want to Bark to try and turn into sound goes here.", info="The text will be split into smaller chunks on the right.") + input = gr.TextArea(placeholder="Text Prompt", label="Text you want Bark to try to turn into sound goes here.", + info="The text will be split into smaller chunks on the right.") with gr.Column(variant="panel", scale=0.5): gr.Markdown("""### ⌨️ Bark 'Console' With hoarder_mode💎💎, every segment of every clip is saved as separate voice files. Your hard drive will be full, but you won't miss that golden sample.""") - - output = gr.HTML(elem_classes ="bark_console", interactive=True) + output = gr.HTML(elem_classes="bark_console", interactive=True) + def clear_logs(): with open("gradio_terminal_ouput.log", "w") as f: f.write("") - + clear_button = gr.Button("Clear Log") clear_button.click(clear_logs) - - with gr.Tab("Main Options"): with gr.Row(): with gr.Column(variant="panel", scale=1): - gr.Markdown("""### 🧑‍🎤 These are installed speakers in bark/assets/prompts/ or custom_speakers/""") - npz_dropdown = create_npz_dropdown(directories, label="Speaker") + gr.Markdown( + """### 🧑‍🎤 These are installed speakers in bark/assets/prompts/ or custom_speakers/""") + npz_dropdown = create_npz_dropdown( + directories, label="Speaker") with gr.Column(variant="panel", scale=1): - gr.Markdown("""### 👩‍🎤🎙️ These are NEW voices you create when you use a random voice, in your output directory""") - generated_voices = create_npz_dropdown(outputs_dirs, label="Generated Speaker") + gr.Markdown( + """### 👩‍🎤🎙️ These are NEW voices you create when you use a random voice, in your output directory""") + generated_voices = create_npz_dropdown( + outputs_dirs, label="Generated Speaker") with gr.Row(): with gr.Column(variant="panel", scale=0.25): - split_character_goal_length = gr.Slider(label="Aim for this many characters in each clip", value=110, maximum=300, step=1) - split_character_max_length = gr.Slider(label="Never go higher than this many characters", value=170, maximum=300, step=1) - dry_run = gr.Checkbox(label="✂️✂️Just show me how you would split this text, don't actually run Bark.", value=False) - text_temp = gr.Slider(label="text_temp", minimum=0.0, maximum=1.0, value = 0.7, interactive = True) - waveform_temp = gr.Slider(label="waveform", minimum=0.0, maximum=1.0, value=0.7, interactive=True) + split_character_goal_length = gr.Slider( + label="Aim for this many characters in each clip", value=110, maximum=300, step=1) + split_character_max_length = gr.Slider( + label="Never go higher than this many characters", value=170, maximum=300, step=1) + dry_run = gr.Checkbox( + label="✂️✂️Just show me how you would split this text, don't actually run Bark.", value=False) + text_temp = gr.Slider( + label="text_temp", minimum=0.0, maximum=1.0, value=0.7, interactive=True) + waveform_temp = gr.Slider( + label="waveform", minimum=0.0, maximum=1.0, value=0.7, interactive=True) with gr.Column(variant="panel", scale=0.25): m("# Joining Segments:") - stable_mode_interval = gr.Dropdown(["Continuous", "Stable", "2","3","4","5"], label="How to Join Clips:", info=">1 means feedback X times, then reset back to the original stable speaker.", value="Stable") - - - - semantic_min_eos_p = 
gr.Slider(label="semantic_min_eos_p", minimum=0.0, maximum=1.0, value=0.2, interactive=True, info="If you're getting extra words at the end of your clisp, try 0.10 or 0.05 here.") - - + stable_mode_interval = gr.Dropdown(["Continuous", "Stable", "2", "3", "4", "5"], label="How to Join Clips:", + info=">1 means feedback X times, then reset back to the original stable speaker.", value="Stable") + semantic_min_eos_p = gr.Slider(label="semantic_min_eos_p", minimum=0.0, maximum=1.0, value=0.2, + interactive=True, info="If you're getting extra words at the end of your clisp, try 0.10 or 0.05 here.") with gr.Column(variant="panel", scale=0.25): - add_silence_between_segments = gr.Slider(label="Silence Between Segment", minimum=0.0, maximum=5.0, value=0.0, interactive=True, info="Add a bit of silence between joined audio segments.") - - confused_travolta_mode = gr.Checkbox(label="๐Ÿ•บ๐Ÿ•บ Confused Mode", value=False) - - hoarder_mode = gr.Checkbox(label="๐Ÿ’Ž๐Ÿ’ŽSave all files for every segment. Recommended", value=False) + add_silence_between_segments = gr.Slider(label="Silence Between Segment", minimum=0.0, maximum=5.0, + value=0.0, interactive=True, info="Add a bit of silence between joined audio segments.") + confused_travolta_mode = gr.Checkbox( + label="๐Ÿ•บ๐Ÿ•บConfused Mode", value=False) + hoarder_mode = gr.Checkbox( + label="๐Ÿ’Ž๐Ÿ’ŽSave all files for every segment. Recommended", value=False) with gr.Column(variant="panel", scale=0.25): - output_dir = gr.Textbox(label="Output directory", value="bark_samples/") - seed = gr.Textbox(label="Random SEED", value="", info="Set one time, at start.") - output_iterations = gr.Textbox(label="Repeat This many Times", value="") - - - - - - - + output_dir = gr.Textbox( + label="Output directory", value="bark_samples/") + seed = gr.Textbox(label="Random SEED", value="", + info="Set one time, at start.") + output_iterations = gr.Textbox( + label="Repeat This many Times", value="") with gr.Tab("Setup Model Options or Preload (Optional, Except for Apple)"): with gr.Row(): @@ -438,89 +463,109 @@ def clear_logs(): with gr.Column(scale=.25, variant="panel"): model_checkboxes = generate_gradio_widgets(model_options) model_button = gr.Button("Preload These Models") - model_button.click(preload_models_gradio, inputs=model_checkboxes) + model_button.click(preload_models_gradio, + inputs=model_checkboxes) with gr.Column(scale=.25, variant="panel"): - gr.Markdown("If you have 10GB of VRAM and want to keep all the big models in your GPU memory memory for maximum speed, set this parameter.") - env_config_vars = ["OFFLOAD_CPU", "USE_SMALL_MODELS", "GLOBAL_ENABLE_MPS"] + gr.Markdown( + "If you have 10GB of VRAM and want to keep all the big models in your GPU memory memory for maximum speed, set this parameter.") + env_config_vars = ["OFFLOAD_CPU", + "USE_SMALL_MODELS", "GLOBAL_ENABLE_MPS"] env_config_values = ["True", "False", "False"] - env_config_group= gr.CheckboxGroup(choices=env_config_vars, value=env_config_values, label="System Wide Config Settings", type="value", interactive=True, visible=True) + env_config_group = gr.CheckboxGroup(choices=env_config_vars, value=env_config_values, + label="System Wide Config Settings", type="value", interactive=True, visible=True) env_button = gr.Button("Set these parameters") - - env_button.click(sent_bark_envs, inputs=env_config_group) + + env_button.click(sent_bark_envs, inputs=env_config_group) with gr.Tab("Tools"): with gr.Row(): with gr.Column(scale=.25): m("### Generate a sample audio clip for each speaker file in a 
directory. Very fast.") - sample_gen_path = gr.Textbox(label="Sample Directory", value="bark/assets/prompts/v2") - sample_gen_button = gr.Button("Gen Voice Samples", variant="primary") - sample_gen_button.click(generate_sample_audio, inputs=sample_gen_path) + sample_gen_path = gr.Textbox( + label="Sample Directory", value="bark/assets/prompts/v2") + sample_gen_button = gr.Button( + "Gen Voice Samples", variant="primary") + sample_gen_button.click( + generate_sample_audio, inputs=sample_gen_path) with gr.Column(scale=.25): - gr.Markdown("### Generate minor variations on existing speaker files.") - gr.Markdown("This is much slower, do don't pick a big directory") - gr.Markdown("Try puttting one file in a directory by itself to test first.") - gr.Markdown("This version leaves the semantic prompt alone, so the variations are pretty minor.") - variation_path = gr.Textbox(label="Speaker Variation Directory", value="bark/assets/prompts/v2") - variation_count = gr.Number(label="How Many Variations", value=3) - generate_speaker_variations_button = gr.Button("Generate Voice Variations", variant="primary") - - generate_speaker_variations_button.click(generate_speaker_variations,inputs=[variation_path, variation_count]) + gr.Markdown( + "### Generate minor variations on existing speaker files.") + gr.Markdown( + "This is much slower, do don't pick a big directory") + gr.Markdown( + "Try puttting one file in a directory by itself to test first.") + gr.Markdown( + "This version leaves the semantic prompt alone, so the variations are pretty minor.") + variation_path = gr.Textbox( + label="Speaker Variation Directory", value="bark/assets/prompts/v2") + variation_count = gr.Number( + label="How Many Variations", value=3) + generate_speaker_variations_button = gr.Button( + "Generate Voice Variations", variant="primary") + + generate_speaker_variations_button.click(generate_speaker_variations, inputs=[ + variation_path, variation_count]) with gr.Column(scale=.25): - loglevel = gr.Dropdown(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], label="# Do you like like logs? Y/N", info="DEBUG = Drown in Text") + loglevel = gr.Dropdown(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + label="# Do you like like logs? Y/N", info="DEBUG = Drown in Text") loglevel_button = gr.Button("Set Log Level") loglevel_button.click(set_loglevel, inputs=loglevel) with gr.Tab("Even More Options"): with gr.Row(): - with gr.Column(scale=.33, variant="panel"): - m("# You might not have asked for a command line interface in your Gradio app, but it sure beats me making 80 more checkboxes.") - m("Some of these options even work. Type them like you would on a command line.") - m("```--semantic_top_k 50```") - m("```--semantic_min_eos_p 0.05```") - - with gr.Column(scale=.33, variant="panel"): - m("### ๏ฟฝ๏ฟฝ๏ฟฝ๏ฟฝ๐Ÿ๐Ÿ๏ฟฝ Raw list of some advanced options that may or may not be implemented or working.") - gr.HTML(f"{formatted_defaults}",elem_classes ="bark_console", info=". I cut a lot of these out becaus they were buggy or took too long to try and merge with regular Bark because I don't really understand the stuff I poke at very well.") - with gr.Column(scale=.33, variant="panel"): - - extra_args_input = gr.TextArea(lines=15, label="Extra Arguments", elem_classes ="bark_console") - - + with gr.Column(scale=.33, variant="panel"): + m("# You might not have asked for a command line interface in your Gradio app, but it sure beats me making 80 more checkboxes.") + m("Some of these options even work. 
Type them like you would on a command line.") + m("```--semantic_top_k 50```") + m("```--semantic_min_eos_p 0.05```") + + with gr.Column(scale=.33, variant="panel"): + m("### ๐Ÿ๐Ÿ Raw list of some advanced options that may or may not be implemented or working.") + gr.HTML(f"{formatted_defaults}", elem_classes="bark_console", + info=". I cut a lot of these out becaus they were buggy or took too long to try and merge with regular Bark because I don't really understand the stuff I poke at very well.") + with gr.Column(scale=.33, variant="panel"): + + extra_args_input = gr.TextArea( + lines=15, label="Extra Arguments", elem_classes="bark_console") + with gr.Tab("๐ŸŽค Clone a Voice? ๐Ÿคท"): # Copied from https://github.com/serp-ai/bark-with-voice-clone and https://github.com/C0untFloyd/bark-gui, haven't really got anything useful from it so far. with gr.Row(): with gr.Column(scale=1, variant="panel"): gr.Markdown("### ๐ŸŽค Clone a Voice???") - gr.Markdown("This code is from https://github.com/serp-ai/bark-with-voice-clone and from https://github.com/C0untFloyd") - gr.Markdown("The only thing I did was have it spit out a gen multiple variants voices rather than one.") + gr.Markdown( + "This code is from https://github.com/serp-ai/bark-with-voice-clone and from https://github.com/C0untFloyd") + gr.Markdown( + "The only thing I did was have it spit out a gen multiple variants voices rather than one.") gr.Markdown("So far no luck, but I didn't experiment with it.") - input_audio_filename = gr.Audio(label="Input audio.wav", source="upload", type="filepath") - transcription_text = gr.Textbox(label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...") + input_audio_filename = gr.Audio( + label="Input audio.wav", source="upload", type="filepath") + transcription_text = gr.Textbox( + label="Transcription Text", lines=1, placeholder="Enter Text of your Audio Sample here...") initialname = "ClonedVoice" - #inputAudioFilename = gr.Textbox(label="Filename of Input Audio", lines=1, placeholder="audio.wav") - output_voice = gr.Textbox(label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname) + # inputAudioFilename = gr.Textbox(label="Filename of Input Audio", lines=1, placeholder="audio.wav") + output_voice = gr.Textbox( + label="Filename of trained Voice", lines=1, placeholder=initialname, value=initialname) clone_voice_button = gr.Button("Create Voice") dummy = gr.Text(label="Progress") - clone_voice_button.click(clone_voice, inputs=[input_audio_filename, transcription_text, output_voice], outputs=dummy) - + clone_voice_button.click(clone_voice, inputs=[ + input_audio_filename, transcription_text, output_voice], outputs=dummy) - with gr.Row(): + with gr.Row(): with gr.Column(scale=1): btn = gr.Button("Generate!", variant="primary") - with gr.Column(scale=1): - - cancel_button = gr.Button("Cancel? (I couldn't get it to work without disconnecting the progress bars.)", label="(Cancel barely worked so I disabled it for now.)", variant="stop") + + cancel_button = gr.Button("Cancel? 
(I couldn't get it to work without disconnecting the progress bars.)", + label="(Cancel barely worked so I disabled it for now.)", variant="stop") cancel_button.click(cancel) with gr.Row(): - - audio_output = gr.Audio(label="Bark Sample", type="filepath") - + audio_output = gr.Audio(label="Bark Sample", type="filepath") theme_selector = gr.Radio( ["Base", "Default", "Monochrome", "Soft", "Glass"], @@ -568,18 +613,12 @@ def clear_logs(): """, ) - - - btn.click(generate_audio_long_gradio,inputs=[input, npz_dropdown, generated_voices,confused_travolta_mode,stable_mode_interval,split_character_goal_length,split_character_max_length, seed, dry_run, output_iterations, hoarder_mode, text_temp, waveform_temp,semantic_min_eos_p, output_dir, add_silence_between_segments, extra_args_input], outputs=[audio_output]) - - + btn.click(generate_audio_long_gradio, inputs=[input, npz_dropdown, generated_voices, confused_travolta_mode, stable_mode_interval, split_character_goal_length, split_character_max_length, + seed, dry_run, output_iterations, hoarder_mode, text_temp, waveform_temp, semantic_min_eos_p, output_dir, add_silence_between_segments, extra_args_input], outputs=[audio_output]) logs = gr.HTML() # this is crazy right? nobody should have to do this to show text output to Gradio? demo.load(read_logs, None, output, every=1) - - demo.queue().launch() -