diff --git a/.gitignore b/.gitignore index 8fcacb85..0a90f594 100644 --- a/.gitignore +++ b/.gitignore @@ -177,4 +177,3 @@ files/ # Generated Config files. examples/**/*.yaml **/config.yaml -examples/language_modeling/scripts diff --git a/examples/language_modeling/README.md b/examples/language_modeling/README.md new file mode 100644 index 00000000..201d699f --- /dev/null +++ b/examples/language_modeling/README.md @@ -0,0 +1,38 @@ +# Training data attribution with a language modeling task +This directory contains the code for running training data attribution with large-scale language models such as Llama 3. In essence, the code ranks the pretraining data by how much each data point contributed to the generation of a target sentence. The procedure to rank the data points is as follows: +1. **Data Preparation**: Generate the model outputs that we will analyze. These are simple scripts that generate outputs from a fixed set of prompts. We experimented with `Meta-Llama-3-8B-Instruct`, `pythia-1.4b`, and `gpt2-xl`. Use `generate_llama3.py` for `Meta-Llama-3-8B-Instruct` and `generate.py` for `pythia-1.4b` and `gpt2-xl`. +```bash +python generate_llama3.py +python generate.py +``` + +2. **Extract Log**: `extract_log.py` extracts training gradients for each pretraining data point, compresses them using LoGra, and saves them to disk. Note that by default we use 1B tokens from the `openwebtext` dataset and leverage data parallelism. An example command is shown below (the actual commands used for the paper can be found in the `scripts` folder). This is the most time-consuming part of the pipeline. +```bash +accelerate launch --num_processes 2 --num_machines 1 --multi_gpu extract_log.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --mlp_only --data_name openwebtext +``` +As a result, the code will generate a folder containing the compressed gradients for each data point, along with other statistics necessary for running LoGra (e.g., the random initialization of the LoGra parameters, the covariance of the gradients, etc.). + +3. **Compute Influence Function**: `compute_influence.py` computes the influence score for each data point, using the compressed gradients we just generated. The query data, selected via `--split` (here, the model generations from step 1), is used to compute the query gradients. Since the (preconditioned) training gradients have already been saved, this is a relatively fast step. +```bash +python compute_influence.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --split generated --mlp_only --data_name openwebtext --mode cosine +``` + +4. **Analysis**: Finally, we also include a minimal analysis script, `analysis.py`, which extracts the top-k most influential data points for each query and saves them to a file. A short sketch of how its inputs can be inspected is given below, followed by the command to run the script.
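+For orientation, here is a minimal, illustrative sketch (not part of the pipeline) of how these saved artifacts can be inspected by hand; the folder and file names below assume the `Meta-Llama-3-8B-Instruct` run with the default settings from steps 2 and 3:
+```python
+import os
+
+import torch
+
+# Folder written by compute_influence.py: {model}_{lora}_{hessian}_{data_name}[_mlp]/{split}
+experiment = "Meta-Llama-3-8B-Instruct_random_raw_openwebtext_mlp/generated"
+scores = torch.load(os.path.join(experiment, "scores_cosine.pt"), map_location="cpu")  # [num_queries, num_train]
+train_ids = torch.load(os.path.join(experiment, "train_ids_cosine.pt"), map_location="cpu")  # training sequences (decoded text)
+test_ids = torch.load(os.path.join(experiment, "test_ids_cosine.pt"), map_location="cpu")  # query sequences (decoded text)
+top5 = torch.topk(scores[0], k=5).indices  # five most influential training points for the first query
+print(test_ids[0], [train_ids[int(i)] for i in top5])
+```
+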
+```bash +python analysis.py +``` diff --git a/examples/language_modeling/analysis.py b/examples/language_modeling/analysis.py new file mode 100755 index 00000000..d0893fdc --- /dev/null +++ b/examples/language_modeling/analysis.py @@ -0,0 +1,34 @@ +import os + +import torch + + +k = 20 +experiment = "pythia-1.4b_random_raw_openwebtext_mlp/generated" +mode = "cosine" + +scores = torch.load(os.path.join(experiment, f"scores_{mode}.pt"), map_location="cpu") +train_ids = torch.load( + os.path.join(experiment, f"train_ids_{mode}.pt"), map_location="cpu" +) +test_ids = torch.load( + os.path.join(experiment, f"test_ids_{mode}.pt"), map_location="cpu" +) +print(len(train_ids), len(test_ids), scores.shape) +assert len(train_ids) == scores.shape[1] +assert len(test_ids) == scores.shape[0] + +out = "" +for idx, test_id in enumerate(test_ids): + out += "==========================================================\n" + out += f"Query: {test_id}\n" + out += "==========================================================\n" + topk_indices = torch.topk(scores[idx], k=k)[1] + for j, topk_idx in enumerate(topk_indices): + score = scores[idx][topk_idx] + train_id = train_ids[topk_idx] + out += f"Top {j + 1} (score: {score}): {train_id}\n" + out += "==========================================================\n" + +with open(os.path.join(experiment, f"top_{mode}.txt"), "w") as file: + file.write(out) diff --git a/examples/language_modeling/compute_influence.py b/examples/language_modeling/compute_influence.py index b36efb20..05262b59 100644 --- a/examples/language_modeling/compute_influence.py +++ b/examples/language_modeling/compute_influence.py @@ -1,17 +1,15 @@ -import os -import copy import argparse +import copy +import os +import logix import torch import torch.nn.functional as F from accelerate import Accelerator -import logix -from logix.analysis import InfluenceFunction from logix.utils import merge_logs from tqdm import tqdm -from utils import get_model, get_tokenizer, get_loader, set_seed - +from utils import get_loader, get_model, get_tokenizer, set_seed if torch.cuda.is_available(): torch.backends.cuda.matmul.allow_tf32 = True @@ -20,55 +18,37 @@ def main(): parser = argparse.ArgumentParser("GPT2 Influence Analysis") parser.add_argument("--config_path", type=str, default="./config.yaml") - parser.add_argument( - "--cache_dir", - type=str, - default="/data/tir/projects/tir3/users/sangkeuc/huggingface", - ) - parser.add_argument( - "--save_dir", - type=str, - default="/data/tir/projects/tir3/users/sangkeuc/gpt/results", - ) - parser.add_argument("--model_name", type=str, default="gpt2-xl") - parser.add_argument("--data_path", type=str, default="wikitext") - parser.add_argument("--data_name", type=str, default=None) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--hessian", type=str, default="raw") + parser.add_argument("--cache_dir", type=str, default=None) + parser.add_argument("--model_name", type=str, default="gpt2") + parser.add_argument("--hessian", type=str, default="kfac") parser.add_argument("--lora", type=str, default="random") - parser.add_argument("--split", type=str, default="train") + parser.add_argument("--split", type=str, default="valid") parser.add_argument("--mlp_only", action="store_true") - parser.add_argument("--layerwise", action="store_true") + parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--damping", type=float, default=1e-5) + parser.add_argument("--data_name", type=str, default="openwebtext") + parser.add_argument("--mode", type=str,
default="dot") args = parser.parse_args() set_seed(0) accelerator = Accelerator() - influence_groups = None - if args.layerwise: - layer_id = "h" if args.model_name == "gpt2-xl" else "layers" - layer_num = 48 if args.model_name == "gpt2-xl" else 32 - influence_groups = [f".{layer_id}.{i}." for i in range(layer_num)] # prepare model & data loader model = get_model(model_name=args.model_name, cache_dir=args.cache_dir) - tokenizer = get_tokenizer( - model_name=args.model_name, cache_dir=args.cache_dir, add_padding_token=True - ) + tokenizer = get_tokenizer(model_name=args.model_name, cache_dir=args.cache_dir) data_loader = get_loader( model_name=args.model_name, - data_path=args.data_path, - data_name=args.data_name, tokenizer=tokenizer, batch_size=args.batch_size, cache_dir=args.cache_dir, split=args.split, + data_name=args.data_name, ) model, data_loader = accelerator.prepare(model, data_loader) # Set-up LogIX model_name_strip = args.model_name.split("/")[-1] - project = f"{model_name_strip}_{args.lora}_{args.hessian}" + project = f"{model_name_strip}_{args.lora}_{args.hessian}_{args.data_name}" name_filter = ["att", "mlp"] if args.mlp_only: project += "_mlp" @@ -82,6 +62,9 @@ def main(): # Influence analysis logix.setup({"log": "grad"}) logix.eval() + if_scores = [] + train_ids = None + test_ids = [] merged_test_logs = [] for idx, batch in enumerate(tqdm(data_loader)): data_id = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True) @@ -103,23 +86,30 @@ def main(): test_log = logix.get_log() merged_test_logs.append(copy.deepcopy(test_log)) - if idx == 12 or idx == len(data_loader) - 1: + if idx == len(data_loader) - 1: merged_test_log = merge_logs(merged_test_logs) - result = run.influence.compute_influence_all( - merged_test_log, log_loader, influence_groups=influence_groups + if_score, train_ids_batch = run.influence.compute_influence_all( + merged_test_log, + log_loader, + mode=args.mode, ) + if_scores.append(if_score) + if train_ids is None: + train_ids = train_ids_batch + else: + assert train_ids == train_ids_batch + test_ids.extend(merged_test_log[0]) + if_scores = torch.cat(if_scores, dim=0) merged_test_logs = [] break - post_fix = f"{args.split}_{model_name_strip}_{args.lora}_{args.hessian}" - if args.mlp_only: - post_fix += "_mlp" - save_dir = os.path.join(args.save_dir, post_fix) + base_dir = os.path.dirname(os.path.abspath(__file__)) # current file's directory + save_dir = os.path.join(base_dir, project, f"{args.split}") if not os.path.exists(save_dir): os.makedirs(save_dir) - torch.save(result["influence"], os.path.join(save_dir, "scores.pt")) - torch.save(result["src_ids"], os.path.join(save_dir, "test_ids.pt")) - torch.save(result["tgt_ids"], os.path.join(save_dir, "train_ids.pt")) + torch.save(if_scores, os.path.join(save_dir, f"scores_{args.mode}.pt")) + torch.save(train_ids, os.path.join(save_dir, f"train_ids_{args.mode}.pt")) + torch.save(test_ids, os.path.join(save_dir, f"test_ids_{args.mode}.pt")) if __name__ == "__main__": diff --git a/examples/language_modeling/extract_log.py b/examples/language_modeling/extract_log.py index 8f51be40..0d5ee39d 100644 --- a/examples/language_modeling/extract_log.py +++ b/examples/language_modeling/extract_log.py @@ -1,16 +1,12 @@ -import os import argparse -from tqdm import tqdm +import logix import torch import torch.nn.functional as F from accelerate import Accelerator -from accelerate.utils import GradScalerKwargs -import logix -from logix.statistic import Covariance - -from utils import get_model, get_tokenizer, 
get_loader, set_seed +from tqdm import tqdm +from utils import get_loader, get_model, get_tokenizer, set_seed # Enable TF32 if possible if torch.cuda.is_available(): @@ -20,21 +16,15 @@ def main(): parser = argparse.ArgumentParser("GPT2 Influence Analysis") parser.add_argument("--config_path", type=str, default="./config.yaml") - parser.add_argument( - "--cache_dir", - type=str, - default="/data/tir/projects/tir3/users/sangkeuc/huggingface", - ) + parser.add_argument("--cache_dir", type=str, default=None) parser.add_argument("--model_name", type=str, default="gpt2") - parser.add_argument("--data_path", type=str, default="wikitext") - parser.add_argument("--data_name", type=str, default=None) parser.add_argument("--batch_size", type=int, default=8) - parser.add_argument("--hessian", type=str, default="raw") + parser.add_argument("--hessian", type=str, default="kfac") parser.add_argument("--lora", type=str, default="random") parser.add_argument("--save", type=str, default="grad") parser.add_argument("--mlp_only", action="store_true") + parser.add_argument("--data_name", type=str, default="openwebtext") args = parser.parse_args() - print(args) set_seed(0) accelerator = Accelerator() @@ -44,16 +34,15 @@ def main(): tokenizer = get_tokenizer(model_name=args.model_name, cache_dir=args.cache_dir) data_loader = get_loader( model_name=args.model_name, - data_path=args.data_path, - data_name=args.data_name, tokenizer=tokenizer, batch_size=args.batch_size, cache_dir=args.cache_dir, + data_name=args.data_name, ) # LogIX Setup model_name_strip = args.model_name.split("/")[-1] - project = f"{model_name_strip}_{args.lora}_{args.hessian}" + project = f"{model_name_strip}_{args.lora}_{args.hessian}_{args.data_name}" name_filter = ["att", "mlp"] if args.mlp_only: project += "_mlp" @@ -87,6 +76,7 @@ def main(): ) accelerator.backward(loss) logix.finalize() + print(f"Log saved in {project}") if __name__ == "__main__": diff --git a/examples/language_modeling/generate.py b/examples/language_modeling/generate.py index 160bd250..993853eb 100755 --- a/examples/language_modeling/generate.py +++ b/examples/language_modeling/generate.py @@ -1,39 +1,166 @@ -import transformers +import argparse +import csv +import json +import os + import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def get_prompt(prompt=None): + if prompt is None: + prompt = [ + "How can we reduce air pollution?", + "Discuss the causes of the Great Depression", + "Propose an ethical solution to the problem of data privacy", + "Generate a poem that expresses joy.", + "Design an app for a delivery company.", + "Generate a pitch for a new and original product.", + "Write a short review for the novel 'The Catcher in the Rye'.", + "What is the process of photosynthesis and why is it important?", + "Explain the difference between HTML and CSS.", + "What is the difference between machine learning and deep learning?", + "Brainstorm creative ideas for designing a conference room.", + "Generate a list of business ideas for a food delivery service.", + "Compose a tweet that addresses the issue of environmental degradation.", + "What is the process of photosynthesis and why is it important?", + "Explain why computational models can be used in analysing text.", + "Follow the law of supply and demand, describe what would happen to the price of a good if the demand increased.", + "Generate a possible future for humankind if artificial intelligence (AI) becomes more advanced and prevalent.", + "Suppose you are managing a marketing 
campaign. What are some methods you can use to measure the success of the campaign?", + "Now that you know the different ways to say hello in French, which one would you use if you were greeting a friend?", + "Write a short story summarizing the following events: (events) An engineer discovers a new form of energy, but it requires a large amount of money to develop.", + "Generate a thesis statement based on the following description. Description: The key principles behind effective physical exercise routines including warming up, cooling down, and rest recovery.", + "Evaluate the quality of the following sentence: 'To get a better understanding of the material we have been studying, going through examples is a good choice.'", + "Identify the main idea of the following paragraph: The driving force behind the success of companies today is their ability to be innovative and adaptive in a constantly changing environment.", + ] + + return [prompt] if isinstance(prompt, str) else prompt + + +# the model generates \n after the prompt when batch generating +def batch_generate( + model, + tokenizer, + prompts=None, + device=None, + top_k=1, + top_p=0.95, + temperature=0.9, + repetition_penalty=1.5, +): + if prompts is None: + prompts = get_prompt() + + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + tokenizer.pad_token = tokenizer.eos_token + inputs = tokenizer( + prompts, + return_tensors="pt", + return_token_type_ids=False, + padding=True, + truncation=True, + ) + inputs = inputs.to(device) + response = model.generate( + **inputs, + max_new_tokens=args.maxlen, + do_sample=True, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.eos_token_id, + ) + generated = tokenizer.batch_decode(response, skip_special_tokens=True) + return prompts, generated + + +def iterative_generate( + model, + tokenizer, + prompts=None, + device=None, + top_k=1, + top_p=0.95, + temperature=0.9, + repetition_penalty=1.5, +): + if prompts is None: + prompts = get_prompt() + + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + generated = [] + for p in prompts: + _, g = batch_generate( + model, + tokenizer, + p, + device, + top_k, + top_p, + temperature, + repetition_penalty, + ) + generated.append(g[0]) + return prompts, generated + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="gpt2 Generation") + parser.add_argument("--model_name", type=str, default="gpt2-xl") + parser.add_argument("--cache_dir", type=str, default=None) + parser.add_argument("--prompt", type=str, default=None) + parser.add_argument("--topk", type=int, default=50) + parser.add_argument("--topp", type=float, default=0.95) + parser.add_argument("--temperature", type=float, default=0.3) + parser.add_argument("--maxlen", type=float, default=256) + parser.add_argument("--repetition-penalty", type=float, default=1.5) + parser.add_argument("--seed", type=int, default=0) + args = parser.parse_args() + + torch.random.manual_seed(args.seed) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = AutoModelForCausalLM.from_pretrained( + args.model_name, + cache_dir=args.cache_dir, + ) + tokenizer = AutoTokenizer.from_pretrained( + args.model_name, + cache_dir=args.cache_dir, + ) + model.eval() + model.to(device) + # print(tokenizer.eos_token_id, tokenizer.pad_token_id) + + prompts = get_prompt(args.prompt) + 
prompts, generated = iterative_generate( + model, + tokenizer, + prompts, + top_k=args.topk, + top_p=args.topp, + temperature=args.temperature, + repetition_penalty=args.repetition_penalty, + ) -model_id = "meta-llama/Meta-Llama-3-8B-Instruct" - -pipeline = transformers.pipeline( - "text-generation", - model=model_id, - model_kwargs={"torch_dtype": torch.bfloat16, "cache_dir": "./cache"}, - device="cuda", -) - -messages = [ - { - "role": "system", - "content": "You are a pirate chatbot who always responds in pirate speak!", - }, - {"role": "user", "content": "Give three tips for staying healthy."}, -] - -prompt = pipeline.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True -) - -terminators = [ - pipeline.tokenizer.eos_token_id, - pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"), -] - -outputs = pipeline( - prompt, - max_new_tokens=256, - eos_token_id=terminators, - do_sample=True, - temperature=0.6, - top_p=0.9, -) -print(outputs[0]["generated_text"]) -# print(outputs[0]["generated_text"][len(prompt):]) + # write prompts and generations into a JSON file + model_name = args.model_name.split("/")[-1] + out_file = f"./custom_data/generated/{model_name}/data.json" + os.makedirs(os.path.dirname(out_file), exist_ok=True) + data_json = [ + { + "prompt": p, + "text": g, + } + for p, g in zip(prompts, generated) + ] + with open(out_file, "w") as f: + json.dump(data_json, f, indent=4) + print(f"Generated data saved to {out_file}") diff --git a/examples/language_modeling/generate_llama3.py b/examples/language_modeling/generate_llama3.py new file mode 100755 index 00000000..62d68b4d --- /dev/null +++ b/examples/language_modeling/generate_llama3.py @@ -0,0 +1,65 @@ +import json +import os + +import transformers +import torch + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + +pipeline = transformers.pipeline( + "text-generation", + model=model_id, + model_kwargs={"torch_dtype": torch.bfloat16}, + device="cuda", +) + +prompts = [ + "How can we reduce air pollution?", + "Discuss the causes of the Great Depression", + "Propose an ethical solution to the problem of data privacy", + "Generate a poem that expresses joy.", + "Design an app for a delivery company.", + "Generate a pitch for a new and original product.", + "Write a short review for the novel 'The Catcher in the Rye'.", + "What is the process of photosynthesis and why is it important?", + "Explain the difference between HTML and CSS.", + "What is the difference between machine learning and deep learning?", + "Brainstorm creative ideas for designing a conference room.", +] + + +prompt_list = [] +output_list = [] +for p in prompts: + messages = [ + {"role": "user", "content": p}, + ] + + prompt = pipeline.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + terminators = [ + pipeline.tokenizer.eos_token_id, + pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"), + ] + + outputs = pipeline( + prompt, + max_new_tokens=256, + eos_token_id=terminators, + do_sample=True, + temperature=0.6, + top_p=0.9, + ) + print(outputs[0]["generated_text"]) + # print(outputs[0]["generated_text"][len(prompt):]) + prompt_list.append(prompt) + output_list.append(outputs[0]["generated_text"]) + +model_name = model_id.split("/")[-1] +filename = f"custom_data/generated/{model_name}/data.json" +os.makedirs(os.path.dirname(filename), exist_ok=True) +data = [{"prompt": p, "text": o} for p, o in zip(prompt_list, output_list)] +with open(filename, "w") as f: + json.dump(data, f, indent=4) diff --git a/examples/language_modeling/qualitative_analysis.py
b/examples/language_modeling/qualitative_analysis.py deleted file mode 100644 index 423cd4d5..00000000 --- a/examples/language_modeling/qualitative_analysis.py +++ /dev/null @@ -1,48 +0,0 @@ -# usage: python qualitative_analysis.py --score_path if_logix.pt --score_path2 files/results/0/wiki_if.pt -import argparse -from transformers import AutoTokenizer - -import torch -from scipy.stats import pearsonr, spearmanr - -from utils import set_seed, get_loaders - - -parser = argparse.ArgumentParser("GPT2 Influence Score qualtitative analysis") -parser.add_argument("--score_path", type=str) -parser.add_argument("--score_path2", type=str) -args = parser.parse_args() - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -set_seed(0) - - -# data -_, eval_train_loader, test_loader = get_loaders( - valid_indices=list(range(128)), -) -tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True, trust_remote_code=True) - -scores = torch.load(args.score_path, map_location="cpu") -if args.score_path2 is not None: - scores2 = torch.load(args.score_path2, map_location="cpu") - - corr = [] - for s1, s2 in zip(scores, scores2): - r = pearsonr(s1, s2)[0] - corr.append(r) - print(f"Average correlation: {sum(corr) / len(corr)}") - -for i in range(16): - print("=" * 80) - print(f"{i}th data point") - sequence = tokenizer.decode(test_loader.dataset[i]["input_ids"]) - print(f"Sequence: {sequence}") - - print("Most influential data point") - rank = torch.argsort(scores[i], descending=True) - for j in range(3): - print(f"Rank {j} (score = {scores[i][rank[j]]})") - sent = tokenizer.decode(eval_train_loader.dataset[int(rank[j])]["input_ids"]) - print(f"Sentence: {sent}") - input() diff --git a/examples/language_modeling/scripts/alpaca.sh b/examples/language_modeling/scripts/alpaca.sh deleted file mode 100755 index 2e28d98a..00000000 --- a/examples/language_modeling/scripts/alpaca.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/bash -#SBATCH --job-name=pythia -#SBATCH --output /home/sangkeuc/logs/\%j.out -#SBATCH --err /home/sangkeuc/logs/\%j.err -#SBATCH --nodes=1 -#SBATCH --gres=gpu:A100_80GB:2 -#SBATCH --mem=160GB -#SBATCH --time 24:00:00 -#SBATCH --mail-user=sangkeuc@andrew.cmu.edu -#SBATCH --mail-type=START,END,FAIL -#SBATCH --partition=general - -source ~/.bashrc -conda init -conda activate analog - -#model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" - -set -x - -accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63252 extract_log.py --model_name tatsu-lab/alpaca --lora random --hessian raw --batch_size 2 -# python extract_log.py --model_name "$model_name" --lora random --hessian raw --batch_size 2 -# python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split valid -# CUDA_VISIBLE_DEVICES=0 python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split external -# CUDA_VISIBLE_DEVICES=0 python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split generated diff --git a/examples/language_modeling/scripts/gpt2_xl.sh b/examples/language_modeling/scripts/gpt2_xl.sh new file mode 100755 index 00000000..c62896bf --- /dev/null +++ b/examples/language_modeling/scripts/gpt2_xl.sh @@ -0,0 +1,20 @@ +#!/usr/bin/bash +#SBATCH --job-name=if +#SBATCH --output /data/tir/projects/tir3/users/hahn2/logix/examples/language_modeling/slurm-out/\%j.out +#SBATCH --err /data/tir/projects/tir3/users/hahn2/logix/examples/language_modeling/slurm-out/\%j.err +#SBATCH --nodes=1 
+#SBATCH --gres=gpu:A100_80GB:1 +#SBATCH --mem=256GB +#SBATCH --time 48:00:00 +#SBATCH --mail-user=hahn2@andrew.cmu.edu +#SBATCH --mail-type=START,END,FAIL +#SBATCH --partition=general + +source ~/.bashrc +conda init +conda activate if + +set -x + +# accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63221 extract_log.py --model_name gpt2-xl --lora random --hessian raw --batch_size 1 --mlp_only --cache_dir cache --data_name openwebtext +python compute_influence.py --model_name gpt2-xl --lora random --hessian raw --split generated --mlp_only --mode cosine --cache_dir cache --data_name openwebtext diff --git a/examples/language_modeling/scripts/llama.sh b/examples/language_modeling/scripts/llama.sh deleted file mode 100755 index 84242737..00000000 --- a/examples/language_modeling/scripts/llama.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/bash -#SBATCH --job-name=pythia -#SBATCH --output /home/sangkeuc/logs/\%j.out -#SBATCH --err /home/sangkeuc/logs/\%j.err -#SBATCH --nodes=1 -#SBATCH --gres=gpu:A100_80GB:2 -#SBATCH --mem=160GB -#SBATCH --time 24:00:00 -#SBATCH --mail-user=sangkeuc@andrew.cmu.edu -#SBATCH --mail-type=START,END,FAIL -#SBATCH --partition=general - -source ~/.bashrc -conda init -conda activate analog - -set -x - -accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63252 extract_log.py --model_name /data/models/huggingface/meta-llama/Llama-2-7b-hf --lora random --hessian raw --batch_size 2 -#python extract_log.py --model_name "$model_name" --lora random --hessian raw --batch_size 2 -#python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split valid -#CUDA_VISIBLE_DEVICES=0 python compute_influence.py --model_name /data/models/huggingface/meta-llama/Llama-2-7b-hf --lora random --hessian raw --split external -#CUDA_VISIBLE_DEVICES=0 python compute_influence.py --model_name /data/models/huggingface/meta-llama/Llama-2-7b-hf --lora random --hessian raw --split generated diff --git a/examples/language_modeling/scripts/llama3.sh b/examples/language_modeling/scripts/llama3.sh index d20a1b13..ed24f5c7 100755 --- a/examples/language_modeling/scripts/llama3.sh +++ b/examples/language_modeling/scripts/llama3.sh @@ -1,22 +1,20 @@ #!/usr/bin/bash -#SBATCH --job-name=pythia -#SBATCH --output /home/sangkeuc/logs/\%j.out -#SBATCH --err /home/sangkeuc/logs/\%j.err +#SBATCH --job-name=if +#SBATCH --output /data/tir/projects/tir3/users/hahn2/logix/examples/language_modeling/slurm-out/\%j.out +#SBATCH --err /data/tir/projects/tir3/users/hahn2/logix/examples/language_modeling/slurm-out/\%j.err #SBATCH --nodes=1 -#SBATCH --gres=gpu:A100_80GB:1 -#SBATCH --mem=80GB +#SBATCH --gres=gpu:L40:1 +#SBATCH --mem=128GB #SBATCH --time 24:00:00 -#SBATCH --mail-user=sangkeuc@andrew.cmu.edu +#SBATCH --mail-user=hahn2@andrew.cmu.edu #SBATCH --mail-type=START,END,FAIL #SBATCH --partition=general source ~/.bashrc conda init -conda activate analog +conda activate if set -x -# accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63221 extract_log.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --batch_size 1 --mlp_only -# python compute_influence.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --split valid --mlp_only -python compute_influence.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --split external --mlp_only -# python compute_influence.py --model_name meta-llama/Meta-Llama-3-8B-Instruct 
--lora random --hessian raw --split generated +# accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63221 extract_log.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --batch_size 1 --mlp_only --data_name openwebtext +python compute_influence.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --lora random --hessian raw --split generated --mlp_only --data_name openwebtext --mode cosine diff --git a/examples/language_modeling/scripts/mistral.sh b/examples/language_modeling/scripts/mistral.sh deleted file mode 100755 index 2cccaaaa..00000000 --- a/examples/language_modeling/scripts/mistral.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/bash -#SBATCH --job-name=pythia -#SBATCH --output /home/sangkeuc/logs/\%j.out -#SBATCH --err /home/sangkeuc/logs/\%j.err -#SBATCH --nodes=1 -#SBATCH --gres=gpu:A100_80GB:1 -#SBATCH --mem=80GB -#SBATCH --time 24:00:00 -#SBATCH --mail-user=sangkeuc@andrew.cmu.edu -#SBATCH --mail-type=START,END,FAIL -#SBATCH --partition=general - -source ~/.bashrc -conda init -conda activate analog - -set -x - -# accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63252 extract_log.py --model_name mistralai/Mistral-7B-v0.1 --lora random --hessian raw --batch_size 1 -python compute_influence.py --model_name mistralai/Mistral-7B-v0.1 --lora random --hessian raw --split valid -python compute_influence.py --model_name mistralai/Mistral-7B-v0.1 --lora random --hessian raw --split external -# python compute_influence.py --model_name mistralai/Mistral-7B-v0.1 --lora random --hessian raw --split generated diff --git a/examples/language_modeling/scripts/mistral_instruct.sh b/examples/language_modeling/scripts/mistral_instruct.sh deleted file mode 100755 index c7af42df..00000000 --- a/examples/language_modeling/scripts/mistral_instruct.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/bash -#SBATCH --job-name=pythia -#SBATCH --output /home/sangkeuc/logs/\%j.out -#SBATCH --err /home/sangkeuc/logs/\%j.err -#SBATCH --nodes=1 -#SBATCH --gres=gpu:A100_80GB:2 -#SBATCH --mem=120GB -#SBATCH --time 24:00:00 -#SBATCH --mail-user=sangkeuc@andrew.cmu.edu -#SBATCH --mail-type=START,END,FAIL -#SBATCH --partition=general - -source ~/.bashrc -conda init -conda activate analog - -set -x - -accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63152 extract_log.py --model_name mistralai/Mistral-7B-Instruct-v0.2 --lora random --hessian raw --batch_size 1 --mlp_only -# python compute_influence.py --model_name mistralai/Mistral-7B-Instruct-v0.2 --lora random --hessian raw --split valid -# python compute_influence.py --model_name mistralai/Mistral-7B-Instruct-v0.2 --lora random --hessian raw --split external -# python compute_influence.py --model_name mistralai/Mistral-7B-Instruct-v0.2 --lora random --hessian raw --split generated diff --git a/examples/language_modeling/scripts/pythia.sh b/examples/language_modeling/scripts/pythia.sh new file mode 100755 index 00000000..e5ebf627 --- /dev/null +++ b/examples/language_modeling/scripts/pythia.sh @@ -0,0 +1,20 @@ +#!/usr/bin/bash +#SBATCH --job-name=if +#SBATCH --output /data/tir/projects/tir3/users/hahn2/logix/examples/language_modeling/slurm-out/\%j.out +#SBATCH --err /data/tir/projects/tir3/users/hahn2/logix/examples/language_modeling/slurm-out/\%j.err +#SBATCH --nodes=1 +#SBATCH --gres=gpu:A100_80GB:1 +#SBATCH --mem=256GB +#SBATCH --time 48:00:00 +#SBATCH --mail-user=hahn2@andrew.cmu.edu +#SBATCH --mail-type=START,END,FAIL +#SBATCH 
--partition=general + +source ~/.bashrc +conda init +conda activate if + +set -x + +# accelerate launch --num_processes 2 --num_machines 1 --multi_gpu --main_process_port 63221 extract_log.py --model_name EleutherAI/pythia-1.4b --lora random --hessian raw --batch_size 1 --mlp_only --data_name openwebtext +python compute_influence.py --model_name EleutherAI/pythia-1.4b --lora random --hessian raw --split generated --mlp_only --data_name openwebtext --mode cosine diff --git a/examples/language_modeling/scripts/run.sh b/examples/language_modeling/scripts/run.sh deleted file mode 100755 index d6e46711..00000000 --- a/examples/language_modeling/scripts/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -if [ $# -eq 0 ]; then - echo "Please provide the model name as an argument." - exit 1 -fi - -model_name=$1 - -# raw + random lora -#accelerate launch --multi_gpu --num_processes 2 --main_process_port 29505 extract_log.py --model_name "$model_name" --lora random --hessian raw --batch_size 2 -#CUDA_VISIBLE_DEVICES=0 python extract_log.py --model_name "$model_name" --lora random --hessian raw --batch_size 2 -#python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split valid -python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split external -python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split generated diff --git a/examples/language_modeling/scripts/tinyllama.sh b/examples/language_modeling/scripts/tinyllama.sh deleted file mode 100755 index cf127946..00000000 --- a/examples/language_modeling/scripts/tinyllama.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/bash -#SBATCH --job-name=pythia -#SBATCH --output /home/sangkeuc/logs/\%j.out -#SBATCH --err /home/sangkeuc/logs/\%j.err -#SBATCH --nodes=1 -#SBATCH --gres=gpu:L40:4 -#SBATCH --mem=128GB -#SBATCH --time 24:00:00 -#SBATCH --mail-user=sangkeuc@andrew.cmu.edu -#SBATCH --mail-type=START,END,FAIL -#SBATCH --partition=general - -source ~/.bashrc -conda init -conda activate analog - -#model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - -set -x - -#accelerate launch --multi_gpu --num_processes 4 --main_process_port 19523 extract_log.py --model_name TinyLlama/TinyLlama-1.1B-Chat-v1.0 --lora random --hessian raw --batch_size 2 -#python extract_log.py --model_name "$model_name" --lora random --hessian raw --batch_size 2 -#python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split valid -python compute_influence.py --model_name TinyLlama/TinyLlama-1.1B-Chat-v1.0 --lora random --hessian raw --split external -#CUDA_VISIBLE_DEVICES=0 python compute_influence.py --model_name "$model_name" --lora random --hessian raw --split generated diff --git a/examples/language_modeling/utils.py b/examples/language_modeling/utils.py index ace99737..3592e99d 100644 --- a/examples/language_modeling/utils.py +++ b/examples/language_modeling/utils.py @@ -1,22 +1,18 @@ -import gc +import copy import os import random -import struct from itertools import chain -import copy -from typing import List, Optional, Tuple import numpy as np import torch import torch.nn as nn -from datasets import load_dataset, load_from_disk +from datasets import Dataset, load_dataset, load_from_disk from transformers import ( AutoConfig, AutoModelForCausalLM, AutoTokenizer, - default_data_collator, - DataCollatorWithPadding, PreTrainedTokenizer, + default_data_collator, ) from transformers.pytorch_utils 
import Conv1D @@ -74,9 +70,7 @@ def get_model(model_name, cache_dir) -> nn.Module: return LanguageModel(model_name, cache_dir) -def get_tokenizer( - model_name, cache_dir, add_padding_token=False -) -> PreTrainedTokenizer: +def get_tokenizer(model_name, cache_dir) -> PreTrainedTokenizer: tokenizer = AutoTokenizer.from_pretrained( model_name, use_fast=True, @@ -84,7 +78,7 @@ def get_tokenizer( cache_dir=cache_dir, ) - if tokenizer.pad_token is None and add_padding_token: + if tokenizer.pad_token is None: print("No pad token found. Setting `` as a pad token.") tokenizer.pad_token = "" if "" not in tokenizer.get_vocab(): @@ -97,60 +91,57 @@ def get_tokenizer( def get_dataset( model_name: str, tokenizer: PreTrainedTokenizer, - data_path: str, - data_name: Optional[str] = None, split: str = "train", - sample_ratio: float = 0.005, cache_dir: str = None, + data_name="openwebtext", ) -> torch.utils.data.DataLoader: - model_name_strip = model_name.split("/")[-1] - save_data_name = data_path if data_name is None else data_name - save_data_name = save_data_name.split("/")[-1] - if os.path.exists( - os.path.join(cache_dir, f"{model_name_strip}_{save_data_name}.pt") - ): - print("[*] Loading from cached data...") - lm_datasets = load_from_disk( - os.path.join(cache_dir, f"{model_name_strip}_{save_data_name}.pt") + assert split in ["train", "valid", "generated", "external"] + + model_name_split = model_name.split("/")[-1] + split_key = "validation" if split == "valid" else "train" + seed = 42 + fname = f"{model_name_split}_{data_name}_{seed}.pt" + if split in ["train", "valid"] and os.path.exists(os.path.join(cache_dir, fname)): + print(f"[*] Loading from cached data... {fname}") + lm_datasets = load_from_disk(os.path.join(cache_dir, fname)) + num_tokens = 1_000_000_000 # we take 1B tokens + seq_len = len(lm_datasets["train"][0]["input_ids"]) + num_examples = min(num_tokens // seq_len, len(lm_datasets["train"])) + lm_datasets[split_key] = lm_datasets[split_key].select(range(num_examples)) + print( + f"Using {num_examples} examples for training. 
{num_examples} * {seq_len} = {num_tokens:,}" ) - return lm_datasets[split] + return lm_datasets[split_key] # Prepare raw dataset - if data_path == "external": - data_kwargs = { - "path": "json", - "data_files": "./custom_data/external/data.json", - "cache_dir": cachd_dir, - "num_proc": 4, - } - elif data_path == "generated": - data_kwargs = { - "path": "json", - "data_files": f"./custom_data/generated/{model_name_strip}/data.json", - "cache_dir": cache_dir, - "num_proc": 4, - } + if split in ["train", "valid"]: + data_path = "openwebtext" + data_kwargs = {} + elif split in ["external"]: + data_path = "json" + data_kwargs = {"data_files": "./custom_data/external/data.json"} else: + data_path = "json" data_kwargs = { - "path": data_path, - "name": data_name, - "cache_dir": cache_dir, - "num_proc": 4, + "data_files": f"./custom_data/generated/{model_name_split}/data.json" } - raw_datasets = load_dataset(**data_kwargs) - - if sample_ratio is not None: - sampled_train = raw_datasets["train"].train_test_split( - test_size=0.005, shuffle=True, seed=42 - ) - raw_datasets["train"] = sampled_train["test"] + print(f"[*] Loading data from {data_path} {data_kwargs}...") + raw_datasets = load_dataset(data_path, **data_kwargs) + if split in ["train", "valid"]: + print(f"[*] Splitting data...") + raw_datasets = raw_datasets[split].train_test_split(test_size=0.95, seed=seed) + if split == "train": + raw_datasets.pop("test") # Tokenize dataset column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): - return tokenizer(examples[text_column_name]) + ret = tokenizer(examples[text_column_name]) + if "prompt" in examples: + ret["prompt"] = tokenizer(examples["prompt"])["input_ids"] + return ret tokenized_datasets = raw_datasets.map( tokenize_function, @@ -162,7 +153,7 @@ def tokenize_function(examples): ) # Group text - if data_path not in ["generated", "external"]: + if split in ["train", "valid"]: block_size = 512 def group_texts(examples): @@ -187,43 +178,52 @@ def group_texts(examples): for idx, token in enumerate(label): if token == tokenizer.pad_token_id: label[idx] = -100 + if "prompt" in examples: + for idx, (prompt, label) in enumerate( + zip(examples["prompt"], examples["labels"]) + ): + for _ in range(len(prompt)): + label[_] = -100 + examples["labels"][idx] = label return examples lm_datasets = tokenized_datasets.map( group_texts, batched=True, - num_proc=4, + num_proc=1, load_from_cache_file=True, desc=f"Grouping texts in chunks of {block_size}", ) - print("[*] Saving data to disk...") - lm_datasets.save_to_disk( - os.path.join(cache_dir, f"{model_name_strip}_{save_data_name}.pt") - ) + if split in ["train", "valid"]: + print(f"[*] Saving data to disk...") + lm_datasets.save_to_disk(os.path.join(cache_dir, fname)) - return lm_datasets[split] + if "prompt" in lm_datasets[split_key].column_names: + lm_datasets[split_key] = lm_datasets[split_key].remove_columns("prompt") + return lm_datasets[split_key] def get_loader( model_name: str, tokenizer: PreTrainedTokenizer, batch_size: int, - data_path: str, - data_name: Optional[str] = None, split: str = "train", cache_dir: str = None, + data_name="openwebtext", ) -> torch.utils.data.DataLoader: dataset = get_dataset( model_name=model_name, tokenizer=tokenizer, - data_path=data_path, - data_name=data_name, split=split, cache_dir=cache_dir, + data_name=data_name, ) dataloader = torch.utils.data.DataLoader( - dataset, batch_size=batch_size, shuffle=False, 
collate_fn=default_data_collator + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=default_data_collator, ) return dataloader