From 1bbd4e976ce516624458e185b91d3c079ad88d67 Mon Sep 17 00:00:00 2001
From: guglie
Date: Sun, 29 Mar 2020 15:44:26 +0200
Subject: [PATCH 1/2] More pytorchic using nn.GELU module

---
 models.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/models.py b/models.py
index deba76f..480e415 100644
--- a/models.py
+++ b/models.py
@@ -22,7 +22,6 @@ class Config(NamedTuple):
     n_layers: int = 12 # Numher of Hidden Layers
     n_heads: int = 12 # Numher of Heads in Multi-Headed Attention Layers
     dim_ff: int = 768*4 # Dimension of Intermediate Layers in Positionwise Feedforward Net
-    #activ_fn: str = "gelu" # Non-linear Activation Function Type in Hidden Layers
     p_drop_hidden: float = 0.1 # Probability of Dropout of various Hidden Layers
     p_drop_attn: float = 0.1 # Probability of Dropout of Attention Layers
     max_len: int = 512 # Maximum Length for Positional Embeddings
@@ -33,11 +32,6 @@ def from_json(cls, file):
         return cls(**json.load(open(file, "r")))
 
 
-def gelu(x):
-    "Implementation of the gelu activation function by Hugging Face"
-    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
 class LayerNorm(nn.Module):
     "A layernorm module in the TF style (epsilon inside the square root)."
     def __init__(self, cfg, variance_epsilon=1e-12):
@@ -112,13 +106,15 @@ class PositionWiseFeedForward(nn.Module):
     """ FeedForward Neural Networks for each position """
     def __init__(self, cfg):
         super().__init__()
-        self.fc1 = nn.Linear(cfg.dim, cfg.dim_ff)
-        self.fc2 = nn.Linear(cfg.dim_ff, cfg.dim)
-        #self.activ = lambda x: activ_fn(cfg.activ_fn, x)
+        self.layers = nn.Sequential(
+            nn.Linear(cfg.dim, cfg.dim_ff),
+            nn.GELU(),
+            nn.Linear(cfg.dim_ff, cfg.dim)
+        )
 
     def forward(self, x):
         # (B, S, D) -> (B, S, D_ff) -> (B, S, D)
-        return self.fc2(gelu(self.fc1(x)))
+        return self.layers(x)
 
 
 class Block(nn.Module):

From eae9f86a7beacc1ebebca4c97c7484e09ac1a137 Mon Sep 17 00:00:00 2001
From: guglie
Date: Sun, 29 Mar 2020 15:46:44 +0200
Subject: [PATCH 2/2] Update pretrain.py

---
 pretrain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pretrain.py b/pretrain.py
index 6a7b720..b850405 100644
--- a/pretrain.py
+++ b/pretrain.py
@@ -169,7 +169,7 @@ def __init__(self, cfg):
         self.fc = nn.Linear(cfg.dim, cfg.dim)
         self.activ1 = nn.Tanh()
         self.linear = nn.Linear(cfg.dim, cfg.dim)
-        self.activ2 = models.gelu
+        self.activ2 = nn.GELU()
         self.norm = models.LayerNorm(cfg)
         self.classifier = nn.Linear(cfg.dim, 2)
         # decoder is shared with embedding layer
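
For reference, a minimal standalone sketch (not part of the patches above) of what the patched feed-forward block computes. nn.GELU() in its default, exact erf-based form evaluates the same x * 0.5 * (1 + erf(x / sqrt(2))) as the removed hand-written gelu helper, and the activation must sit between the two projections to preserve the original self.fc2(gelu(self.fc1(x))) ordering. The dim/dim_ff constructor arguments here are a simplification of the cfg object used in models.py.

import torch
import torch.nn as nn


class PositionWiseFeedForward(nn.Module):
    """Position-wise feed-forward net: (B, S, D) -> (B, S, D_ff) -> (B, S, D)."""
    def __init__(self, dim, dim_ff):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(dim, dim_ff),
            nn.GELU(),                 # exact (erf-based) GELU by default
            nn.Linear(dim_ff, dim),
        )

    def forward(self, x):
        return self.layers(x)


if __name__ == "__main__":
    ff = PositionWiseFeedForward(dim=768, dim_ff=768 * 4)
    x = torch.randn(2, 16, 768)        # (batch, seq_len, dim)
    print(ff(x).shape)                 # torch.Size([2, 16, 768])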