Upload processing_r.py with huggingface_hub
Browse files — processing_r.py +44 −2
processing_r.py
CHANGED
|
@@ -20,7 +20,7 @@ import numpy as np
|
|
| 20 |
from transformers.feature_extraction_utils import BatchFeature
|
| 21 |
from transformers.image_processing_utils import select_best_resolution
|
| 22 |
from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
|
| 23 |
-
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
| 24 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 25 |
from transformers.utils import logging
|
| 26 |
|
|
@@ -190,6 +190,48 @@ class RProcessor(ProcessorMixin):
|
|
| 190 |
return (unpadded_features, newline_features)
|
| 191 |
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
| 194 |
def batch_decode(self, *args, **kwargs):
|
| 195 |
"""
|
|
@@ -214,4 +256,4 @@ class RProcessor(ProcessorMixin):
|
|
| 214 |
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
| 215 |
|
| 216 |
|
| 217 |
-
__all__ = ["RProcessor"]
|
|
|
|
| 20 |
from transformers.feature_extraction_utils import BatchFeature
|
| 21 |
from transformers.image_processing_utils import select_best_resolution
|
| 22 |
from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
|
| 23 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, MultiModalData
|
| 24 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 25 |
from transformers.utils import logging
|
| 26 |
|
|
|
|
| 190 |
return (unpadded_features, newline_features)
|
| 191 |
|
| 192 |
|
| 193 |
+
def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
    """
    Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

    Args:
        image_sizes (list[list[int]], *optional*):
            The input sizes formatted as (height, width) per each image.
        video_sizes (list[list[int]], *optional*):
            The input sizes formatted as (num_frames, height, width) per each video.
            Currently unused by this processor; accepted for interface compatibility.

    Returns:
        `MultiModalData`: Holds, per accepted modality, the number of placeholder tokens
        required. If no input sizes are provided for a modality, its entries are omitted.
    """
    vision_data = {}
    if image_sizes is not None:
        # Copy the defaults before merging: `_defaults` is a shared class-level dict,
        # and an in-place `update(kwargs)` would leak caller kwargs into every
        # subsequent call on any instance.
        images_kwargs = dict(RProcessorKwargs._defaults.get("images_kwargs", {}))
        images_kwargs.update(kwargs)

        # Resolve the processed (height, width): explicit kwarg wins, else the
        # image processor's configured size. `shortest_edge` configs produce a
        # square; height/width configs use the smaller edge for both dims.
        size = images_kwargs.get("size", None) or self.image_processor.size
        size = (
            (size["shortest_edge"], size["shortest_edge"])
            if "shortest_edge" in size
            else (min(size["height"], size["width"]), min(size["height"], size["width"]))
        )
        processed_height, processed_width = size

        batch_num_image_tokens = []
        # Pixels are not batched into multiple patches here (unlike Idefics),
        # so every image counts as exactly one patch.
        num_image_patches = [1] * len(image_sizes)
        for image_size in image_sizes:
            orig_height, orig_width = image_size
            num_image_tokens = self._get_number_of_features(
                orig_height, orig_width, processed_height, processed_width
            )
            # The "default" strategy drops the CLS token from the feature count.
            if self.vision_feature_select_strategy == "default":
                num_image_tokens -= 1
            batch_num_image_tokens.append(num_image_tokens)
        vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})

    return MultiModalData(**vision_data)
|
| 234 |
+
|
| 235 |
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
| 236 |
def batch_decode(self, *args, **kwargs):
|
| 237 |
"""
|
|
|
|
| 256 |
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
| 257 |
|
| 258 |
|
| 259 |
+
__all__ = ["RProcessor"]
|