diff --git a/mmdet/datasets/transforms/text_transformers.py b/mmdet/datasets/transforms/text_transformers.py
index 12a0e57db3d..6aaf6bd61df 100644
--- a/mmdet/datasets/transforms/text_transformers.py
+++ b/mmdet/datasets/transforms/text_transformers.py
@@ -60,7 +60,7 @@ def check_for_positive_overflow(gt_bboxes, gt_labels, text, tokenizer,
             keep_gt_labels.append(gt_labels[i])
 
     return gt_bboxes[keep_box_index], np.array(
-        keep_gt_labels, dtype=np.long), length
+        keep_gt_labels, dtype=np.long), length, keep_box_index
 
 
 def generate_senetence_given_labels(positive_label_list, negative_label_list,
@@ -164,7 +164,7 @@ def od_aug(self, results):
             if '/' in value:
                 text[key] = random.choice(value.split('/')).strip()
 
-        gt_bboxes, gt_labels, positive_caption_length = \
+        gt_bboxes, gt_labels, positive_caption_length, keep_box_index = \
             check_for_positive_overflow(gt_bboxes, gt_labels, text,
                                         self.tokenizer, self.max_tokens)
 
@@ -217,7 +217,7 @@ def od_aug(self, results):
 
             negative_max_length -= len(tokenized)
 
-            if negative_max_length > 0:
+            if negative_max_length >= 0:
                 screened_negative_label_list.append(negative_label)
             else:
                 break
@@ -232,6 +232,8 @@ def od_aug(self, results):
 
         results['gt_bboxes'] = gt_bboxes
         results['gt_bboxes_labels'] = gt_labels
+        if results.get('gt_ignore_flags', None) is not None:
+            results['gt_ignore_flags'] = results['gt_ignore_flags'][keep_box_index]
         results['text'] = pheso_caption
         results['tokens_positive'] = label_to_positions
 
diff --git a/mmdet/models/detectors/grounding_dino.py b/mmdet/models/detectors/grounding_dino.py
index b1ab7c2da16..411ebd7cb92 100644
--- a/mmdet/models/detectors/grounding_dino.py
+++ b/mmdet/models/detectors/grounding_dino.py
@@ -162,7 +162,8 @@ def get_tokens_and_prompts(
                 [caption_string],
                 padding='max_length' if self.language_model.pad_to_max else
                 'longest',
-                return_tensors='pt')
+                return_tensors='pt',
+                add_special_tokens=False)
             entities = original_caption
         else:
             if not original_caption.endswith('.'):
@@ -175,7 +176,8 @@ def get_tokens_and_prompts(
                 [original_caption],
                 padding='max_length' if self.language_model.pad_to_max else
                 'longest',
-                return_tensors='pt')
+                return_tensors='pt',
+                add_special_tokens=False)
             tokens_positive, noun_phrases = run_ner(original_caption)
             entities = noun_phrases
             caption_string = original_caption
@@ -224,7 +226,8 @@ def get_tokens_positive_and_prompts(
                 [original_caption],
                 padding='max_length' if self.language_model.pad_to_max else
                 'longest',
-                return_tensors='pt')
+                return_tensors='pt',
+                add_special_tokens=False)
             positive_map_label_to_token, positive_map = \
                 self.get_positive_map(tokenized, tokens_positive)
 
@@ -281,7 +284,8 @@ def get_tokens_positive_and_prompts_chunked(
             caption_string, tokens_positive = self.to_plain_text_prompts(
                 original_caption_chunked[i])
             tokenized = self.language_model.tokenizer([caption_string],
-                                                      return_tensors='pt')
+                                                      return_tensors='pt',
+                                                      add_special_tokens=False)
             if tokenized.input_ids.shape[1] > self.language_model.max_tokens:
                 warnings.warn('Inputting a text that is too long will result '
                               'in poor prediction performance. '
@@ -439,7 +443,8 @@ def loss(self, batch_inputs: Tensor,
                     [text_prompt],
                     padding='max_length'
                     if self.language_model.pad_to_max else 'longest',
-                    return_tensors='pt')
+                    return_tensors='pt',
+                    add_special_tokens=False)
                 new_tokens_positive = [
                     token_positive[label.item()] for label in gt_label
                 ]
@@ -477,7 +482,7 @@ def loss(self, batch_inputs: Tensor,
                     positive_maps.append(positive_map)
                     new_text_prompts.append(caption_string)
 
-        text_dict = self.language_model(new_text_prompts)
+        text_dict = self.language_model(new_text_prompts, add_special_tokens=False)
         if self.text_feat_map is not None:
             text_dict['embedded'] = self.text_feat_map(text_dict['embedded'])
 
@@ -553,7 +558,7 @@ def predict(self, batch_inputs, batch_data_samples, rescale: bool = True):
             for b in range(len(text_prompts[0])):
                 text_prompts_once = [text_prompts[0][b]]
                 token_positive_maps_once = token_positive_maps[0][b]
-                text_dict = self.language_model(text_prompts_once)
+                text_dict = self.language_model(text_prompts_once, add_special_tokens=False)
                 # text feature map layer
                 if self.text_feat_map is not None:
                     text_dict['embedded'] = self.text_feat_map(
@@ -577,7 +582,7 @@ def predict(self, batch_inputs, batch_data_samples, rescale: bool = True):
             is_rec_tasks = [False] * len(results_list)
         else:
             # extract text feats
-            text_dict = self.language_model(list(text_prompts))
+            text_dict = self.language_model(list(text_prompts), add_special_tokens=False)
             # text feature map layer
             if self.text_feat_map is not None:
                 text_dict['embedded'] = self.text_feat_map(
diff --git a/mmdet/models/language_models/bert.py b/mmdet/models/language_models/bert.py
index efb0f46bad6..427229b909c 100644
--- a/mmdet/models/language_models/bert.py
+++ b/mmdet/models/language_models/bert.py
@@ -55,10 +55,10 @@ def generate_masks_with_special_tokens_and_transfer_map(
                   device=input_ids.device).bool().unsqueeze(0).repeat(
                       bs, 1, 1))
     position_ids = torch.zeros((bs, num_token), device=input_ids.device)
-    previous_col = 0
+    previous_col = -1
     for i in range(idxs.shape[0]):
         row, col = idxs[i]
-        if (col == 0) or (col == num_token - 1):
+        if col == 0:
             attention_mask[row, col, col] = True
             position_ids[row, col] = 0
         else:
@@ -68,6 +68,9 @@
                 0, col - previous_col, device=input_ids.device)
         previous_col = col
+        if i + 1 != idxs.shape[0] and idxs[i + 1][0] != idxs[i][0]:
+            previous_col = -1
+
 
     return attention_mask, position_ids.to(torch.long)
 
 
@@ -143,7 +146,8 @@ def forward(self, captions: Sequence[str], **kwargs) -> dict:
             padding='max_length' if self.pad_to_max else 'longest',
             return_special_tokens_mask=True,
             return_tensors='pt',
-            truncation=True).to(device)
+            truncation=True,
+            add_special_tokens=kwargs.get("add_special_tokens", True)).to(device)
         input_ids = tokenized.input_ids
         if self.use_sub_sentence_represent:
             attention_mask, position_ids = \
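
Below is a minimal, self-contained sketch (not part of the patch above) showing what the add_special_tokens=False argument, which this diff threads through the detector and the BERT language model, changes at the tokenizer level. It assumes the transformers package and the stock bert-base-uncased checkpoint; the caption string is an arbitrary example, not taken from the repository.

# Sketch only: compare BERT tokenization with and without special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
caption = 'person. car. dog.'  # arbitrary example caption

with_special = tokenizer([caption], return_tensors='pt')
without_special = tokenizer([caption], return_tensors='pt',
                            add_special_tokens=False)

# With special tokens the caption is wrapped in [CLS] ... [SEP], so every
# caption token is shifted by one index and a trailing separator is appended;
# with add_special_tokens=False the token positions follow the caption text
# directly.
print(tokenizer.convert_ids_to_tokens(with_special.input_ids[0].tolist()))
# ['[CLS]', 'person', '.', 'car', '.', 'dog', '.', '[SEP]']
print(tokenizer.convert_ids_to_tokens(without_special.input_ids[0].tolist()))
# ['person', '.', 'car', '.', 'dog', '.']

Note that the forward() change in bert.py keeps the old behaviour as the default (add_special_tokens falls back to True) and only disables it when a caller passes the flag explicitly, as the updated GroundingDINO code paths now do.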