from typing import Optional, Union import torch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.masking_utils import create_bidirectional_mask, create_causal_mask from transformers.models.bert.modeling_bert import BertEncoder, BertPooler, BertPreTrainedModel, BertOnlyMLMHead from transformers.modeling_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, MaskedLMOutput, SequenceClassifierOutput, ) from transformers.processing_utils import Unpack from transformers.utils import TransformersKwargs, auto_docstring, logging from transformers.utils.generic import can_return_tuple, merge_with_config_defaults from transformers.utils.output_capturing import capture_outputs from .configuration_bert_hash import BertHashConfig logger = logging.get_logger(__name__) class BertHashTokens(nn.Module): """ Module that embeds token vocabulary to an intermediate embeddings layer then projects those embeddings to the hidden size. The number of projections is like a hash. Setting the projections parameter to 5 is like generating a 160-bit hash (5 x float32) for each token. That hash is then projected to the hidden size. This significantly reduces the number of parameters necessary for token embeddings. For example: Standard token embeddings: 30,522 (vocab size) x 768 (hidden size) = 23,440,896 parameters 23,440,896 x 4 (float32) = 93,763,584 bytes Hash token embeddings: 30,522 (vocab size) x 5 (hash buckets) + 5 x 768 (projection matrix)= 156,450 parameters 156,450 x 4 (float32) = 625,800 bytes """ def __init__(self, config): super().__init__() self.config = config # Token embeddings self.embeddings = nn.Embedding(config.vocab_size, config.projections, padding_idx=config.pad_token_id) # Token embeddings projections self.projections = nn.Linear(config.projections, config.hidden_size) def forward(self, input_ids): # Project embeddings to hidden size return self.projections(self.embeddings(input_ids)) class BertHashEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): super().__init__() self.word_embeddings = BertHashTokens(config) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) def forward( self, input_ids: torch.LongTensor | None = None, token_type_ids: torch.LongTensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, past_key_values_length: int = 0, ) -> torch.Tensor: if input_ids is not None: input_shape = input_ids.size() else: input_shape = inputs_embeds.size()[:-1] batch_size, seq_length = input_shape device = input_ids.device if input_ids is not None else inputs_embeds.device if position_ids is None: position_ids = ( torch.arange(seq_length, dtype=torch.long, device=device) .unsqueeze(0) .expand(batch_size, seq_length) ) # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves # issue #5664 if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings position_embeddings = self.position_embeddings(position_ids) embeddings = embeddings + position_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @auto_docstring( custom_intro=""" The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in [Attention is all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. """ ) class BertHashModel(BertPreTrainedModel): config_class = BertHashConfig _no_split_modules = ["BertEmbeddings", "BertLayer"] def __init__(self, config, add_pooling_layer=True): r""" add_pooling_layer (bool, *optional*, defaults to `True`): Whether to add a pooling layer """ super().__init__(config) self.config = config self.gradient_checkpointing = False self.embeddings = BertHashEmbeddings(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.embeddings.word_embeddings.embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value @merge_with_config_defaults @capture_outputs @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: use_cache = False if use_cache and past_key_values is None: past_key_values = ( EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) if encoder_hidden_states is not None or self.config.is_encoder_decoder else DynamicCache(config=self.config) ) past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) attention_mask, encoder_attention_mask = self._create_attention_masks( attention_mask=attention_mask, encoder_attention_mask=encoder_attention_mask, embedding_output=embedding_output, encoder_hidden_states=encoder_hidden_states, past_key_values=past_key_values, ) encoder_outputs = self.encoder( embedding_output, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, position_ids=position_ids, **kwargs, ) sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, ) def _create_attention_masks( self, attention_mask, encoder_attention_mask, embedding_output, encoder_hidden_states, past_key_values, ): if self.config.is_decoder: attention_mask = create_causal_mask( config=self.config, inputs_embeds=embedding_output, attention_mask=attention_mask, past_key_values=past_key_values, ) else: attention_mask = create_bidirectional_mask( config=self.config, inputs_embeds=embedding_output, attention_mask=attention_mask, ) if encoder_attention_mask is not None: encoder_attention_mask = create_bidirectional_mask( config=self.config, inputs_embeds=embedding_output, attention_mask=encoder_attention_mask, encoder_hidden_states=encoder_hidden_states, ) return attention_mask, encoder_attention_mask @auto_docstring class BertForMaskedLM(BertPreTrainedModel): _tied_weights_keys = { "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight", "cls.predictions.decoder.bias": "cls.predictions.bias", } config_class = BertHashConfig def __init__(self, config): super().__init__(config) if config.is_decoder: logger.warning( "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " "bi-directional self-attention." ) self.bert = BertHashModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() def get_output_embeddings(self): return self.cls.predictions.decoder def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias @can_return_tuple @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, return_dict=True, **kwargs, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @auto_docstring( custom_intro=""" Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """ ) class BertHashForSequenceClassification(BertPreTrainedModel): config_class = BertHashConfig def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.config = config self.bert = BertHashModel(config) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # Initialize weights and apply final processing self.post_init() @can_return_tuple @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor] | SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, return_dict=True, **kwargs, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(logits.squeeze(), labels.squeeze()) else: loss = loss_fct(logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )