Document Huggingface Cloud #1777
Comments
Added some more time series models
```python
# use the "amazon/chronos-t5-tiny" time series model, feeding in the SPY historical daily data
def use_times_series_model(self):
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import torch
    from chronos import ChronosPipeline

    pipeline = ChronosPipeline.from_pretrained(
        "amazon/chronos-t5-tiny",
        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )

    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))

    # context must be either a 1D tensor, a list of 1D tensors,
    # or a left-padded 2D tensor with batch as the first dimension
    context = torch.tensor(df["close"])
    prediction_length = 12
    forecast = pipeline.predict(context, prediction_length)

    # visualize the forecast
    forecast_index = range(len(df), len(df) + prediction_length)
    low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)
    self.log(f'Prediction low: {low}. median: {median}. high: {high}. forecast: {str(forecast)}')
```
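The "visualize the forecast" step above only logs the quantiles. A minimal matplotlib sketch of the actual plot, assuming the `df`, `forecast_index`, `low`, `median` and `high` variables from the block above and a research environment where plots can render:

```python
import matplotlib.pyplot as plt

# plot historical closes followed by the median forecast and an 80% prediction interval
plt.figure(figsize=(8, 4))
plt.plot(range(len(df)), df["close"].values, color="royalblue", label="historical data")
plt.plot(forecast_index, median, color="tomato", label="median forecast")
plt.fill_between(forecast_index, low, high, color="tomato", alpha=0.3, label="80% prediction interval")
plt.legend()
plt.show()
```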
See TODO: we assign a random positive/negative label to each news item here, but this could be derived from the news itself (see the sketch after this code block).

```python
def get_object_store_model_path(self, model_name):
    adjusted_model_name = model_name.replace('/', '-')
    return self.object_store.get_file_path(f'llm/fine-tune/{adjusted_model_name}/')

# example of how to fine tune the "ProsusAI/finbert" model from the cache and store it into the object store
def fine_tune_finbert_model(self):
    ### CREATE MODEL
    import numpy as np
    import tensorflow as tf
    from transformers import TFBertForSequenceClassification, BertTokenizer

    model_name = "ProsusAI/finbert"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=3, from_pt=True)

    ### FETCH THE DATA
    from datasets import Dataset

    def fetch_tiingo_news_dataset(start_date, end_date):
        aapl = self.add_equity("AAPL", Resolution.MINUTE)
        dataset_symbol = self.add_data(TiingoNews, aapl.symbol).symbol
        history_df = self.history(dataset_symbol, start_date, end_date, Resolution.DAILY)
        history_df = history_df.reset_index()[['description']]
        # TODO: random positive/negative labels; this could be based on the news tags/description etc.
        history_df['label'] = np.random.randint(0, 2, size=len(history_df))
        # rename the description column to text
        history_df = history_df.rename(columns={"description": "text"})
        # create the dataset from the pandas dataframe
        dataset = Dataset.from_pandas(history_df)

        def preprocess_function(examples):
            return tokenizer(examples['text'], padding='max_length', truncation=True)

        encoded_dataset = dataset.map(preprocess_function, batched=True)
        return encoded_dataset

    # Split dataset
    train_dataset = fetch_tiingo_news_dataset(datetime(2023, 10, 1), datetime(2024, 1, 1))
    eval_dataset = fetch_tiingo_news_dataset(datetime(2024, 1, 1), datetime(2024, 2, 1))

    ### FINE TUNE THE MODEL
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss)
    tf_dataset = model.prepare_tf_dataset(train_dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
    model.fit(tf_dataset, epochs=2, steps_per_epoch=115)

    # we set the output directory to an object store location
    output_dir = self.get_object_store_model_path(model_name)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# use the "ProsusAI/finbert" model from the cache
def use_original_finbert_model(self):
    self.use_tf_finbert_model("ProsusAI/finbert")

# use the fine tuned "ProsusAI/finbert" model from the object store
def use_fine_tuned_finbert_model(self):
    from pathlib import Path
    model_dir = self.get_object_store_model_path("ProsusAI/finbert")
    self.use_tf_finbert_model(Path(model_dir))

def use_tf_finbert_model(self, model_path):
    from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
    import tensorflow as tf

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)

    # this is for debugging & comparing with other models and fine tunes
    self.log(f"Using model: {str(model)}")
    for layer in model.layers:
        self.log(f"LAYER: {layer.weights}")

    # Prepare the input sentences
    sentences = ["Stocks rallied and the British pound gained."]
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='tf')

    # Get the model outputs
    outputs = model(**inputs)

    # Apply softmax to the outputs to get probabilities
    res = tf.nn.softmax(outputs.logits, axis=-1).numpy()
    self.log(str(res))
```
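For the TODO about random labels, one option is a simple keyword heuristic over the news description. A minimal sketch, with purely illustrative keyword lists (a real implementation could use the Tiingo news tags or subsequent returns instead):

```python
# illustrative keyword lists - assumptions, not part of the original code
POSITIVE_WORDS = {"beat", "beats", "surge", "record", "upgrade", "growth", "rally"}
NEGATIVE_WORDS = {"miss", "misses", "drop", "plunge", "downgrade", "loss", "lawsuit"}

def label_from_description(description: str) -> int:
    # 1 = positive, 0 = negative; ties and neutral text fall back to 0 here
    words = set(str(description).lower().split())
    return 1 if len(words & POSITIVE_WORDS) > len(words & NEGATIVE_WORDS) else 0

# e.g. inside fetch_tiingo_news_dataset, instead of the random labels:
# history_df['label'] = history_df['description'].apply(label_from_description)
```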
See TODO: the model wants evenly sampled data, but equity prices aren't there on weekends -> maybe we can fake the time series, basically just an incremental time counter (see the sketch after this code block).

```python
# use the "Salesforce/moirai-1.0-R-{SIZE}" time series model, feeding in the SPY historical daily data
def use_times_series_model_moirai(self):
    import torch
    import matplotlib.pyplot as plt
    import pandas as pd
    from gluonts.dataset.pandas import PandasDataset
    from gluonts.dataset.split import split
    from uni2ts.eval_util.plot import plot_single
    from uni2ts.model.moirai import MoiraiForecast, MoiraiModule

    SIZE = "small"  # model size: choose from {'small', 'base', 'large'}
    PDT = 20        # prediction length: any positive integer
    CTX = 200       # context length: any positive integer
    PSZ = "auto"    # patch size: choose from {"auto", 8, 16, 32, 64, 128}
    BSZ = 32        # batch size: any positive integer
    TEST = 100      # test set length: any positive integer

    # Read data into a pandas DataFrame
    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))
    adjusted_df = df.reset_index()[['time', 'close']]
    adjusted_df = adjusted_df.rename(columns={'close': 'target'})
    adjusted_df['time'] = pd.to_datetime(adjusted_df['time'])
    adjusted_df.set_index('time', inplace=True)
    # TODO: the model wants the data evenly sampled
    adjusted_df = adjusted_df.resample('D').asfreq()

    ds = PandasDataset(adjusted_df, freq="D")

    # Split into train/test set
    train, test_template = split(
        ds, offset=-TEST
    )  # assign last TEST time steps as test set

    # Construct rolling window evaluation
    test_data = test_template.generate_instances(
        prediction_length=PDT,  # number of time steps for each prediction
        windows=TEST // PDT,    # number of windows in rolling window evaluation
        distance=PDT,           # number of time steps between each window - distance=PDT for non-overlapping windows
    )

    # Prepare the pre-trained model by downloading the model weights from the Hugging Face hub
    model = MoiraiForecast(
        module=MoiraiModule.from_pretrained(f"Salesforce/moirai-1.0-R-{SIZE}"),
        prediction_length=PDT,
        context_length=CTX,
        patch_size=PSZ,
        num_samples=100,
        target_dim=1,
        feat_dynamic_real_dim=ds.num_feat_dynamic_real,
        past_feat_dynamic_real_dim=ds.num_past_feat_dynamic_real,
    )

    predictor = model.create_predictor(batch_size=BSZ)
    forecasts = predictor.predict(test_data.input)
    forecast_it = iter(forecasts)
    forecast = next(forecast_it)
    self.log(str(forecast))
```
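One possible take on the "incremental time counter" TODO above (also referenced in the Chronos fine-tuning comment below): drop the weekend gaps and re-index the observed rows onto an artificial, evenly spaced daily axis. A minimal sketch, assuming `adjusted_df` is the DataFrame built above and that this replaces the `resample('D').asfreq()` call:

```python
import pandas as pd

def to_even_time_axis(adjusted_df: pd.DataFrame) -> pd.DataFrame:
    # keep only the rows we actually observed (trading days) ...
    evenly_sampled = adjusted_df.dropna().copy()
    # ... and pretend they are evenly spaced daily points (an incremental counter in disguise)
    evenly_sampled.index = pd.date_range(
        start=evenly_sampled.index[0], periods=len(evenly_sampled), freq="D"
    )
    return evenly_sampled

# usage:
# adjusted_df = to_even_time_axis(adjusted_df)
# ds = PandasDataset(adjusted_df, freq="D")
```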
Example of fine tuning the Amazon Chronos time series model using daily data & storing it to the object store. See TODO: the model wants evenly sampled data, but equity prices aren't there on weekends -> maybe we can fake the time series, basically just an incremental time counter, similar to the Moirai example above.

```python
def get_object_store_model_path(self, model_name):
    adjusted_model_name = model_name.replace('/', '-')
    return self.object_store.get_file_path(f'llm/fine-tune/{adjusted_model_name}/')

def fine_tune_chronos_model(self):
    import pandas as pd

    # Read data into a pandas DataFrame
    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))
    adjusted_df = df.reset_index()[['time', 'close']]
    adjusted_df = adjusted_df.rename(columns={'close': 'target'})
    adjusted_df['time'] = pd.to_datetime(adjusted_df['time'])
    adjusted_df.set_index('time', inplace=True)
    # TODO: the model wants the data evenly sampled
    adjusted_df = adjusted_df.resample('D').asfreq()

    model_name = "amazon/chronos-t5-tiny"
    model_dir = self.get_object_store_model_path(model_name)
    self.train_chronos([adjusted_df],
                       model_id=model_name,
                       output_dir=model_dir,
                       # Requires Ampere GPUs (e.g., A100)
                       tf32=False,
                       # TODO Change me, 10 is just a quick tune
                       max_steps=10)
def train_chronos(self, training_data,
                  probability: Optional[str] = None,
                  context_length: int = 512,
                  prediction_length: int = 64,
                  min_past: int = 64,
                  max_steps: int = 200_000,
                  save_steps: int = 50_000,
                  log_steps: int = 500,
                  per_device_train_batch_size: int = 32,
                  learning_rate: float = 1e-3,
                  optim: str = "adamw_torch_fused",
                  shuffle_buffer_length: int = 100,
                  gradient_accumulation_steps: int = 2,
                  model_id: str = "google/t5-efficient-tiny",
                  model_type: str = "seq2seq",
                  random_init: bool = False,
                  tie_embeddings: bool = False,
                  output_dir: str = "./output/",
                  tf32: bool = True,
                  torch_compile: bool = True,
                  tokenizer_class: str = "MeanScaleUniformBins",
                  tokenizer_kwargs: str = "{'low_limit': -15.0, 'high_limit': 15.0}",
                  n_tokens: int = 4096,
                  n_special_tokens: int = 2,
                  pad_token_id: int = 0,
                  eos_token_id: int = 1,
                  use_eos_token: bool = True,
                  lr_scheduler_type: str = "linear",
                  warmup_ratio: float = 0.0,
                  dataloader_num_workers: int = 1,
                  max_missing_prop: float = 0.9,
                  num_samples: int = 20,
                  temperature: float = 1.0,
                  top_k: int = 50,
                  top_p: float = 1.0,
                  seed: Optional[int] = None):
    from ast import literal_eval
    from pathlib import Path
    from functools import partial
    from typing import List, Iterator, Optional, Dict
    from torch.utils.data import IterableDataset, get_worker_info
    from transformers import Trainer, TrainingArguments, set_seed
    from gluonts.dataset.pandas import PandasDataset
    from gluonts.itertools import Filter
    from chronos import ChronosConfig
    # load the helper training scripts and set the logger instance
    from chronos.scripts.training.train import ChronosDataset, has_enough_observations, load_model
    from chronos.scripts.training import train
    from logging import getLogger, INFO
    train.logger = getLogger()
    train.logger.setLevel(INFO)

    output_dir = Path(output_dir)
    if isinstance(probability, str):
        probability = literal_eval(probability)
    elif probability is None:
        probability = [1.0 / len(training_data)] * len(training_data)
    if isinstance(tokenizer_kwargs, str):
        tokenizer_kwargs = literal_eval(tokenizer_kwargs)
    assert isinstance(tokenizer_kwargs, dict)

    assert model_type in ["seq2seq", "causal"]
    if not model_type == "seq2seq":
        raise NotImplementedError("Only seq2seq models are currently supported")

    if seed is None:
        import random
        seed = random.randint(0, 2**32)
    # transformers' set_seed
    set_seed(seed=seed)

    self.log(f"Output dir: {output_dir}. Using SEED: {seed}. Mixing probabilities: {probability}")
    self.log(f"Loading and filtering {len(training_data)} datasets for training: {training_data}")
    train_datasets = [
        Filter(
            partial(
                has_enough_observations,
                min_length=min_past + prediction_length,
                max_missing_prop=max_missing_prop,
            ),
            PandasDataset(data_frame, freq="D"),
        )
        for data_frame in training_data
    ]

    self.log("Initializing model")
    model = load_model(
        model_id=model_id,
        model_type=model_type,
        vocab_size=n_tokens,
        random_init=random_init,
        tie_embeddings=tie_embeddings,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
    )
    chronos_config = ChronosConfig(
        tokenizer_class=tokenizer_class,
        tokenizer_kwargs=tokenizer_kwargs,
        n_tokens=n_tokens,
        n_special_tokens=n_special_tokens,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        use_eos_token=use_eos_token,
        model_type=model_type,
        context_length=context_length,
        prediction_length=prediction_length,
        num_samples=num_samples,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # Add extra items to the model config so that it's saved in the ckpt
    model.config.chronos_config = chronos_config.__dict__

    shuffled_train_dataset = ChronosDataset(
        datasets=train_datasets,
        probabilities=probability,
        tokenizer=chronos_config.create_tokenizer(),
        context_length=context_length,
        prediction_length=prediction_length,
        min_past=min_past,
        mode="training",
    ).shuffle(shuffle_buffer_length=shuffle_buffer_length)

    # Define training args
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        per_device_train_batch_size=per_device_train_batch_size,
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,
        warmup_ratio=warmup_ratio,
        optim=optim,
        logging_dir=str(output_dir / "train-logs"),
        logging_strategy="steps",
        logging_steps=log_steps,
        save_strategy="steps",
        save_steps=save_steps,
        report_to=["tensorboard"],
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        dataloader_num_workers=dataloader_num_workers,
        tf32=tf32,  # remove this if not using Ampere GPUs (e.g., A100)
        torch_compile=torch_compile,
        ddp_find_unused_parameters=False,
        remove_unused_columns=False,
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=shuffled_train_dataset,
    )
    self.log("Training start...")
    trainer.train()
    self.log("Training ended!")
    model.save_pretrained(output_dir)
```

USAGE:

```python
def get_object_store_model_path(self, model_name):
    adjusted_model_name = model_name.replace('/', '-')
    return self.object_store.get_file_path(f'llm/fine-tune/{adjusted_model_name}/')

def use_times_series_model_chronos_from_cache(self):
    self.use_times_series_model_chronos("amazon/chronos-t5-tiny")

def use_fine_tuned_chronos_model(self):
    from pathlib import Path
    model_dir = self.get_object_store_model_path("amazon/chronos-t5-tiny")
    self.use_times_series_model_chronos(Path(model_dir))

# use the given time series model name or path, feeding in the SPY historical daily data
def use_times_series_model_chronos(self, model_name_or_path):
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import torch
    from chronos import ChronosPipeline

    pipeline = ChronosPipeline.from_pretrained(
        model_name_or_path,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )
    self.log(f"Using model: {str(pipeline)}")

    spy = self.add_equity("SPY", resolution=Resolution.DAILY)
    df = self.history([spy.symbol], start=datetime(2023, 1, 1), end=datetime(2024, 1, 1))

    # context must be either a 1D tensor, a list of 1D tensors,
    # or a left-padded 2D tensor with batch as the first dimension
    context = torch.tensor(df["close"])
    prediction_length = 12
    forecast = pipeline.predict(context, prediction_length)

    # visualize the forecast
    forecast_index = range(len(df), len(df) + prediction_length)
    low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)
    self.log(f'Prediction low: {low}. median: {median}. high: {high}. forecast: {str(forecast)}')
```
These models are available in live trading, backtesting & research in the cloud environment.
You can also access the installed models and their revisions (see the sketch at the end of this comment).
Current output:
- Example of how to use the ProsusAI/finbert model
- Example of how to train a model

TODO WIP: pending adding new libraries to improve usage
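To access the installed models and their revisions mentioned above, one option is the Hugging Face hub's cache-scanning API. A minimal sketch, assuming the models live in the default Hugging Face cache (`log_installed_models` is a hypothetical helper name):

```python
def log_installed_models(self):
    # list every model cached locally by the transformers / huggingface_hub libraries
    from huggingface_hub import scan_cache_dir

    cache_info = scan_cache_dir()
    for repo in sorted(cache_info.repos, key=lambda r: r.repo_id):
        revisions = ", ".join(rev.commit_hash[:8] for rev in repo.revisions)
        self.log(f"{repo.repo_type}: {repo.repo_id} - revisions: {revisions}")
```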