Qsevent77 committed 7 days ago

Commit

306867d

verified ·

1 Parent(s): 072a865

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

.gitattributes +3 -0
README.md +491 -0
config.json +70 -0
config_sentence_transformers.json +12 -0
custom_st.py +275 -0
model.safetensors +3 -0
modules.json +14 -0
onnx/model.onnx +3 -0
onnx/model.onnx_data +3 -0
onnx/model_bnb4.onnx +3 -0
onnx/model_fp16.onnx +3 -0
onnx/model_int8.onnx +3 -0
onnx/model_q4.onnx +3 -0
onnx/model_q4f16.onnx +3 -0
onnx/model_quantized.onnx +3 -0
onnx/model_uint8.onnx +3 -0
preprocessor_config.json +22 -0
pytorch_model.bin +3 -0
special_tokens_map.json +51 -0
tokenizer.json +3 -0
tokenizer_config.json +54 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+*.original filter=lfs diff=lfs merge=lfs -text
+onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,491 @@

+---
+base_model:
+- jinaai/xlm-roberta-flash-implementation
+language:
+- multilingual
+- af
+- am
+- ar
+- as
+- az
+- be
+- bg
+- bn
+- br
+- bs
+- ca
+- cs
+- cy
+- da
+- de
+- el
+- en
+- eo
+- es
+- et
+- eu
+- fa
+- fi
+- fr
+- fy
+- ga
+- gd
+- gl
+- gu
+- ha
+- he
+- hi
+- hr
+- hu
+- hy
+- id
+- is
+- it
+- ja
+- jv
+- ka
+- kk
+- km
+- kn
+- ko
+- ku
+- ky
+- la
+- lo
+- lt
+- lv
+- mg
+- mk
+- ml
+- mn
+- mr
+- ms
+- my
+- ne
+- nl
+- 'no'
+- om
+- or
+- pa
+- pl
+- ps
+- pt
+- ro
+- ru
+- sa
+- sd
+- si
+- sk
+- sl
+- so
+- sq
+- sr
+- su
+- sv
+- sw
+- ta
+- te
+- th
+- tl
+- tr
+- ug
+- uk
+- ur
+- uz
+- vi
+- xh
+- yi
+- zh
+library_name: transformers
+license: cc-by-nc-4.0
+tags:
+- xlm-roberta
+- eva02
+- clip
+- feature-extraction
+- sentence-similarity
+- retrieval
+- multimodal
+- multi-modal
+- crossmodal
+- cross-modal
+- mteb
+- clip-benchmark
+- vidore
+- transformers
+- sentence-transformers
+- onnx
+- safetensors
+- transformers.js
+inference: false
+---
+<br><br>
+<p align="center">
+<img src="https://huggingface.co/datasets/jinaai/documentation-images/resolve/main/logo.webp" alt="Jina AI: Your Search Foundation, Supercharged!" width="150px">
+</p>
+<p align="center">
+<b>The embedding set trained by <a href="https://jina.ai/"><b>Jina AI</b></a>.</b>
+</p>
+<p align="center">
+<b>Jina CLIP v2: Multilingual Multimodal Embeddings for Texts and Images</b>
+</p>
+This model is based on the paper [jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images](https://huggingface.co/papers/2412.08802).
+## Quick Start
+[Blog](https://jina.ai/news/jina-clip-v2-multilingual-multimodal-embeddings-for-text-and-images) | [Technical Report](https://arxiv.org/abs/2412.08802) | [Azure](https://azuremarketplace.microsoft.com/en-gb/marketplace/apps/jinaai.jina-clip-v2-vm?tab=Overview) | [AWS SageMaker](https://aws.amazon.com/marketplace/pp/prodview-bfbctuqmky676) | [Google Cloud Platform](https://console.cloud.google.com/marketplace/browse?hl=en&inv=1&invt=AbiD-g&q=jina) | [API](https://jina.ai/embeddings)
+## Intended Usage & Model Info
+`jina-clip-v2` is a **general-purpose multilingual multimodal embedding model for text & images**.
+Multimodal embeddings enable searching and understanding data across different modalities through a coherent representation. They serve as the backbone of neural information retrieval and multimodal GenAI applications.
+Built upon [`jina-clip-v1`](https://huggingface.co/jinaai/jina-clip-v1) and our recently released [`jina-embeddings-v3`](https://huggingface.co/jinaai/jina-embeddings-v3), `jina-clip-v2` features several significant improvements:
+* **Improved Performance**: v2 shows a 3% performance improvement over v1 in both text-image and text-text retrieval tasks. Similar to v1, v2's text encoder can serve as an effective multilingual long-context dense retriever. It performs on par with our frontier model `jina-embeddings-v3` (currently the best multilingual embeddings under 1B parameters on MTEB).
+* **Multilingual Support**: Using the same backbone as `jina-embeddings-v3` for the text tower, `jina-clip-v2` supports 89 languages for multilingual-image retrieval, showing up to 4% improvement compared to `nllb-clip-large-siglip` on multilingual image retrieval tasks.
+* **Higher Image Resolution**: v2 now supports 512x512 input image resolution, a significant increase from v1's 224x224. This higher resolution enables better processing of detailed images, improved feature extraction, and more accurate recognition of fine-grained visual elements.
+* **Matryoshka Representations**: v2 allows users to truncate the output dimensions of both text and image embeddings from 1024 down to 64, reducing storage and processing overhead while maintaining strong performance.
+Measuring 0.9B parameters, `jina-clip-v2` combines two powerful encoders:
+* the text encoder `Jina-XLM-RoBERTa` (the backbone of `jina-embeddings-v3`) and
+* the vision encoder `EVA02-L14` (an efficient vision Transformer developed by BAAI).
+| FEATURE               | TEXT ENCODER            | IMAGE ENCODER    |
+|-----------------------|-------------------------|------------------|
+| Base Model	           | Jina-XLM-RoBERTa	       | EVA02-L          |
+| Parameters	           | 561M                    | 304M             |
+| Input Specification	  | 8,192 tokens (max)	     | 512×512 pixels   |
+| Min Output Dimensions | 64                      | 64               |
+| Max Output Dimensions | 1,024                   | 1,024            |
+| Layers	               | 24                      | 24               |
+| Attention Mechanism	  | FlashAttention2	        | xFormers         |
+| Pooling Strategy	     | Mean pooling	           | CLS pooling      |
+| Additional Features	  | 89 languages supported	 | Patch size 14x14 |
+These encoders are jointly trained to create aligned representations of images and text.
+CLIP-like models have established themselves as the backbone for general-purpose multimodal applications. With `jina-clip-v2`, we're taking these capabilities to the next level, breaking down language barriers to deliver more accurate cross-modal understanding and retrieval. We're confident this release delivers on our promise to make multimodal search and retrieval both more powerful and more accessible to developers worldwide.
+## Training, Data, Parameters
+Please refer to our [technical report of jina-clip-v2](https://arxiv.org/abs/2412.08802) for the model and training details.
+For the previous model generation, see the [technical report of jina-clip-v1](https://arxiv.org/abs/2405.20204).
+## Faster Inference: FA2, XFormers and bf16
+On a CUDA-enabled torch environment, the model comes in `torch.bfloat16`
+precision by default. It is highly recommended to install
+[FlashAttention](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features)
+and [xFormers](https://github.com/facebookresearch/xformers?tab=readme-ov-file#installing-xformers)
+to make use of their efficient attention mechanism implementations.
+## Usage
+<details>
+  <summary>via Jina AI <a href="https://jina.ai/embeddings/">Embedding API</a></summary>
+```bash
+curl https://api.jina.ai/v1/embeddings \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer [JINA_AI_API_TOKEN]" \
+  -d @- <<EOFEOF
+  {
+    "model": "jina-clip-v2",
+    "dimensions": 1024,
+    "task": "retrieval.query",
+    "normalized": true,
+    "embedding_type": "float",
+    "input": [
+        {
+            "text": "غروب جميل على الشاطئ"
+        },
+        {
+            "text": "海滩上美丽的日落"
+        },
+        {
+            "text": "A beautiful sunset over the beach"
+        },
+        {
+            "text": "Un beau coucher de soleil sur la plage"
+        },
+        {
+            "text": "Ein wunderschöner Sonnenuntergang am Strand"
+        },
+        {
+            "text": "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία"
+        },
+        {
+            "text": "समुद्र तट पर एक खूबसूरत सूर्यास्त"
+        },
+        {
+            "text": "Un bellissimo tramonto sulla spiaggia"
+        },
+        {
+            "text": "浜辺に沈む美しい夕日"
+        },
+        {
+            "text": "해변 위로 아름다운 일몰"
+        },
+        {
+            "image": "https://i.ibb.co/nQNGqL0/beach1.jpg"
+        },
+        {
+            "image": "https://i.ibb.co/r5w8hG8/beach2.jpg"
+        }
+    ]
+  }
+EOFEOF
+```
+</details>
+<details>
+  <summary>via <a href="https://huggingface.co/docs/transformers/en/index">transformers</a></summary>
+```python
+# !pip install transformers einops timm pillow
+from transformers import AutoModel
+# Initialize the model
+model = AutoModel.from_pretrained('jinaai/jina-clip-v2', trust_remote_code=True)
+# Corpus
+sentences = [
+    'غروب جميل على الشاطئ', # Arabic
+    '海滩上美丽的日落', # Chinese
+    'Un beau coucher de soleil sur la plage', # French
+    'Ein wunderschöner Sonnenuntergang am Strand', # German
+    'Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία', # Greek
+    'समुद्र तट पर एक खूबसूरत सूर्यास्त', # Hindi
+    'Un bellissimo tramonto sulla spiaggia', # Italian
+    '浜辺に沈む美しい夕日', # Japanese
+    '해변 위로 아름다운 일몰', # Korean
+]
+# Public image URLs or PIL Images
+image_urls = ['https://i.ibb.co/nQNGqL0/beach1.jpg', 'https://i.ibb.co/r5w8hG8/beach2.jpg']
+# Choose a matryoshka dimension, set to None to get the full 1024-dim vectors
+truncate_dim = 512
+# Encode text and images
+text_embeddings = model.encode_text(sentences, truncate_dim=truncate_dim)
+image_embeddings = model.encode_image(
+    image_urls, truncate_dim=truncate_dim
+)  # also accepts PIL.Image.Image, local filenames, dataURI
+# Encode query text
+query = 'beautiful sunset over the beach' # English
+query_embeddings = model.encode_text(
+    query, task='retrieval.query', truncate_dim=truncate_dim
+)
+# Text to Image
+print('En -> Img: ' + str(query_embeddings @ image_embeddings[0].T))
+# Image to Image
+print('Img -> Img: ' + str(image_embeddings[0] @ image_embeddings[1].T))
+# Text to Text
+print('En -> Ar: ' + str(query_embeddings @ text_embeddings[0].T))
+print('En -> Zh: ' + str(query_embeddings @ text_embeddings[1].T))
+print('En -> Fr: ' + str(query_embeddings @ text_embeddings[2].T))
+print('En -> De: ' + str(query_embeddings @ text_embeddings[3].T))
+print('En -> Gr: ' + str(query_embeddings @ text_embeddings[4].T))
+print('En -> Hi: ' + str(query_embeddings @ text_embeddings[5].T))
+print('En -> It: ' + str(query_embeddings @ text_embeddings[6].T))
+print('En -> Jp: ' + str(query_embeddings @ text_embeddings[7].T))
+print('En -> Ko: ' + str(query_embeddings @ text_embeddings[8].T))
+```
+</details>
+<details>
+  <summary>via <a href="https://sbert.net/">sentence-transformers</a></summary>
+```python
+# !pip install sentence-transformers einops timm pillow
+from sentence_transformers import SentenceTransformer
+# Choose a matryoshka dimension
+truncate_dim = 512
+# Initialize the model
+model = SentenceTransformer(
+    'jinaai/jina-clip-v2', trust_remote_code=True, truncate_dim=truncate_dim
+)
+# Corpus
+sentences = [
+    'غروب جميل على الشاطئ', # Arabic
+    '海滩上美丽的日落', # Chinese
+    'Un beau coucher de soleil sur la plage', # French
+    'Ein wunderschöner Sonnenuntergang am Strand', # German
+    'Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία', # Greek
+    'समुद्र तट पर एक खूबसूरत सूर्यास्त', # Hindi
+    'Un bellissimo tramonto sulla spiaggia', # Italian
+    '浜辺に沈む美しい夕日', # Japanese
+    '해변 위로 아름다운 일몰', # Korean
+]
+# Public image URLs or PIL Images
+image_urls = ['https://i.ibb.co/nQNGqL0/beach1.jpg', 'https://i.ibb.co/r5w8hG8/beach2.jpg']
+# Encode text and images
+text_embeddings = model.encode(sentences, normalize_embeddings=True)
+image_embeddings = model.encode(
+    image_urls, normalize_embeddings=True
+)  # also accepts PIL.Image.Image, local filenames, dataURI
+# Encode query text
+query = 'beautiful sunset over the beach' # English
+query_embeddings = model.encode(
+    query, prompt_name='retrieval.query', normalize_embeddings=True
+)
+```
+</details>
+<details>
+  <summary>via <a href="https://huggingface.co/docs/transformers.js/en/index">transformers.js</a></summary>
+> [!NOTE]
+> JinaCLIP was added in Transformers.js v3.1.0, so make sure you're using a compatible version!
+> See the [release notes](https://github.com/huggingface/transformers.js/releases/tag/3.1.0) for more information.
+If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
+```bash
+npm i @huggingface/transformers
+```
+**Example:** Compute text and/or image embeddings with `jinaai/jina-clip-v2`:
+```js
+import { AutoModel, AutoProcessor, RawImage, matmul } from "@huggingface/transformers";
+// Load processor and model
+const model_id = "jinaai/jina-clip-v2";
+const processor = await AutoProcessor.from_pretrained(model_id);
+const model = await AutoModel.from_pretrained(model_id, { dtype: "q4" /* e.g., "fp16", "q8", or "q4" */ });
+// Prepare inputs
+const urls = ["https://i.ibb.co/nQNGqL0/beach1.jpg", "https://i.ibb.co/r5w8hG8/beach2.jpg"];
+const images = await Promise.all(urls.map(url => RawImage.read(url)));
+const sentences = [
+    "غروب جميل على الشاطئ", // Arabic
+    "海滩上美丽的日落", // Chinese
+    "Un beau coucher de soleil sur la plage", // French
+    "Ein wunderschöner Sonnenuntergang am Strand", // German
+    "Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία", // Greek
+    "समुद्र तट पर एक खूबसूरत सूर्यास्त", // Hindi
+    "Un bellissimo tramonto sulla spiaggia", // Italian
+    "浜辺に沈む美しい夕日", // Japanese
+    "해변 위로 아름다운 일몰", // Korean
+];
+// Encode text and images
+const inputs = await processor(sentences, images, { padding: true, truncation: true });
+const { l2norm_text_embeddings, l2norm_image_embeddings } = await model(inputs);
+// Encode query (text-only)
+const query_prefix = "Represent the query for retrieving evidence documents: ";
+const query_inputs = await processor(query_prefix + "beautiful sunset over the beach");
+const { l2norm_text_embeddings: query_embeddings } = await model(query_inputs);
+// Compute text-image similarity scores
+const text_to_image_scores = await matmul(query_embeddings, l2norm_image_embeddings.transpose(1, 0));
+console.log("text-image similarity scores", text_to_image_scores.tolist()[0]); // [0.29530206322669983, 0.3183615803718567]
+// Compute image-image similarity scores
+const image_to_image_score = await matmul(l2norm_image_embeddings[0], l2norm_image_embeddings[1]);
+console.log("image-image similarity score", image_to_image_score.item()); // 0.9344457387924194
+// Compute text-text similarity scores
+const text_to_text_scores = await matmul(query_embeddings, l2norm_text_embeddings.transpose(1, 0));
+console.log("text-text similarity scores", text_to_text_scores.tolist()[0]); // [0.5566609501838684, 0.7028406858444214, 0.582255482673645, 0.6648036241531372, 0.5462006330490112, 0.6791588068008423, 0.6192430257797241, 0.6258729100227356, 0.6453716158866882]
+```
+</details>
+<details>
+  <summary>via the <a href="https://onnxruntime.ai/">ONNX Runtime</a></summary>
+```python
+# !pip install transformers onnxruntime pillow
+import onnxruntime as ort
+from transformers import AutoImageProcessor, AutoTokenizer
+# Load tokenizer and image processor using transformers
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-clip-v2', trust_remote_code=True)
+image_processor = AutoImageProcessor.from_pretrained(
+    'jinaai/jina-clip-v2', trust_remote_code=True
+)
+# Corpus
+sentences = [
+    'غروب جميل على الشاطئ', # Arabic
+    '海滩上美丽的日落', # Chinese
+    'Un beau coucher de soleil sur la plage', # French
+    'Ein wunderschöner Sonnenuntergang am Strand', # German
+    'Ένα όμορφο ηλιοβασίλεμα πάνω από την παραλία', # Greek
+    'समुद्र तट पर एक खूबसूरत सूर्यास्त', # Hindi
+    'Un bellissimo tramonto sulla spiaggia', # Italian
+    '浜辺に沈む美しい夕日', # Japanese
+    '해변 위로 아름다운 일몰', # Korean
+]
+# Public image URLs or PIL Images
+image_urls = ['https://i.ibb.co/nQNGqL0/beach1.jpg', 'https://i.ibb.co/r5w8hG8/beach2.jpg']
+# Tokenize input texts and transform input images
+input_ids = tokenizer(sentences, return_tensors='np')['input_ids']
+pixel_values = image_processor(image_urls)['pixel_values']
+# Start an ONNX Runtime Session
+session = ort.InferenceSession('jina-clip-v2/onnx/model.onnx')
+# Run inference
+output = session.run(None, {'input_ids': input_ids, 'pixel_values': pixel_values})
+# Keep the normalised embeddings, first 2 outputs are un-normalized
+_, _, text_embeddings, image_embeddings = output
+```
+</details>
+## License
+This model is licensed to download and run under [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/deed.en). It is available for commercial use via the [Jina Embeddings API](https://jina.ai/embeddings/), [AWS](https://aws.amazon.com/marketplace/pp/prodview-bfbctuqmky676), [Azure](https://azuremarketplace.microsoft.com/en-gb/marketplace/apps/jinaai.jina-clip-v2-vm?tab=Overview), and [GCP](https://console.cloud.google.com/marketplace/browse?hl=en&inv=1&invt=AbiFWQ&q=jina). To download for commercial use, please [contact us](https://jina.ai/contact-sales).
+## Contact
+Join our [Discord community](https://discord.jina.ai) and chat with other community members about ideas.
+## Citation
+If you find `jina-clip-v2` useful in your research, please cite the following paper:
+```bibtex
+@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
+      title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
+      author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
+      year={2024},
+      eprint={2412.08802},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2412.08802},
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+    "add_projections": false,
+    "architectures": [
+        "JinaCLIPModel"
+    ],
+    "auto_map": {
+        "AutoConfig": "jinaai/jina-clip-implementation--configuration_clip.JinaCLIPConfig",
+        "AutoModel": "jinaai/jina-clip-implementation--modeling_clip.JinaCLIPModel"
+    },
+    "initializer_factor": 1.0,
+    "logit_scale_init_value": 2.6592,
+    "matryoshka_dimensions": [32, 64, 128, 256, 512, 768, 1024],
+    "model_type": "jina_clip",
+    "projection_dim": 1024,
+    "text_config": {
+        "default_instruction_task": null,
+        "default_lora_task": "retrieval.query",
+        "embed_dim": 1024,
+        "hf_model_config_kwargs": {
+            "load_trained_adapters": false,
+            "lora_adaptations": [
+                "retrieval.query"
+            ],
+            "lora_alpha": 4,
+            "lora_dropout_p": 0.0,
+            "lora_main_params_trainable": false,
+            "lora_rank": 4,
+            "task_instructions": {
+                "retrieval.query": "Represent the query for retrieving evidence documents: "
+            },
+            "use_flash_attn": true
+        },
+        "hf_model_name_or_path": "jinaai/jina-embeddings-v3",
+        "model_type": "jina_clip_text",
+        "pooler_type": "mean_pooler",
+        "proj_bias": false,
+        "proj_type": null
+    },
+    "torch_dtype": "bfloat16",
+    "transformers.js_config": {
+        "use_external_data_format": {
+            "model.onnx": true
+        }
+    },
+    "truncate_dim": null,
+    "use_text_flash_attn": null,
+    "use_vision_xformers": null,
+    "vision_config": {
+        "embed_dim": 1024,
+        "fused_layer_norm": false,
+        "head_width": 64,
+        "image_size": 512,
+        "intp_freq": true,
+        "layers": 24,
+        "ls_init_value": null,
+        "mlp_ratio": 2.6667,
+        "model_type": "jina_clip_vision",
+        "naive_swiglu": true,
+        "patch_dropout": 0.1,
+        "patch_size": 14,
+        "post_norm": false,
+        "proj_type": null,
+        "pt_hw_seq_len": 16,
+        "qkv_bias": true,
+        "rope_embeddings": true,
+        "subln": true,
+        "width": 1024,
+        "x_attention": true
+    }
+}

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "__version__": {
+    "sentence_transformers": "3.3.0",
+    "transformers": "4.46.2",
+    "pytorch": "2.2.2"
+  },
+  "prompts":{
+    "retrieval.query":"Represent the query for retrieving evidence documents: "
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}

custom_st.py ADDED Viewed

	@@ -0,0 +1,275 @@

+import base64
+import json
+import os
+from io import BytesIO
+from typing import Any, Dict, List, Literal, Optional, Union
+import requests
+import torch
+from PIL import Image
+from torch import nn
+from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoTokenizer
+class Transformer(nn.Module):
+    save_in_root: bool = True
+    def __init__(
+        self,
+        model_name_or_path: str = 'jinaai/jina-clip-v2',
+        tokenizer_name_or_path: Optional[str] = None,
+        image_processor_name_or_path: Optional[str] = None,
+        max_seq_length: Optional[int] = None,
+        config_args: Optional[Dict[str, Any]] = None,
+        model_args: Optional[Dict[str, Any]] = None,
+        tokenizer_args: Optional[Dict[str, Any]] = None,
+        image_processor_args: Optional[Dict[str, Any]] = None,
+        assume_text_inputs: bool = False,
+        cache_dir: Optional[str] = None,
+        backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
+        **_,
+    ) -> None:
+        """
+        Creates a custom SentenceTransformer module that uses `jinai/jina-clip-v2` to
+        map sentences/images to embeddings
+        Args:
+            model_name_or_path (str, optional): If it is a filepath on disc, it loads
+                the model from that path. If it is not a path, tries to construct a
+                model from the Hugging Face Hub with that name. Defaults to
+                'jinaai/jina-clip-v2'
+            tokenizer_name_or_path (str, optional): If it is a filepath on disc, it
+                loads the tokenizer from that path. If it is not a path, tries to
+                construct a tokenizer from the Hugging Face Hub with that name.
+                If `None` it is automatically set to the value of `model_name_or_path`
+            image_processor_name_or_path (str, optional): If it is a filepath on disc,
+                it loads the image processor from that path. If it is not a path, tries
+                to construct an image processor from the Hugging Face Hub with that
+                name. If `None` it is automatically set to the value of
+                `model_name_or_path`
+            max_seq_length (int, optional): The maximum sequence length of the model.
+                If not provided, will be inferred from model or tokenizer
+            config_args (Dict[str, Any], optional): Additional model configuration
+                parameters to be passed to the Hugging Face Transformers config
+            model_args (Dict[str, Any], optional): Additional model configuration
+                parameters to be passed to the Hugging Face Transformers model
+            tokenizer_args (Dict[str, Any], optional): Additional tokenizer
+                configuration parameters to be passed to the Hugging Face Transformers
+                tokenizer
+            image_processor_args (Dict[str, Any], optional): Additional image processor
+                configuration parameters to be passed to the Hugging Face Transformers
+                image processor
+            assume_text_inputs (bool, optional): If set to `True`, all inputs are
+                treated as texts. Defaults to `False`
+            cache_dir (str, optional): The Hugging Face Hub cache directory
+            backend (str, optional): Computational backend, only 'torch' is supported
+        Example:
+            ::
+                from sentence_transformers import SentenceTransformer
+                model = SentenceTransformer(
+                    'jinaai/jina-clip-v2', trust_remote_code=True
+                )
+                sentences_or_images = [
+                    "The weather is lovely today.",
+                    "It's so sunny outside!",
+                    "/path/to/stadium.jpg",
+                ]
+                embeddings = model.encode(sentences_or_images)
+                print(embeddings.shape)
+                # (3, 1024)
+                # Get the similarity scores between all inputs
+                similarities = model.similarity(embeddings, embeddings)
+                print(similarities)
+                # tensor([[1.0000, 0.6817, 0.0492],
+                #         [0.6817, 1.0000, 0.0421],
+                #         [0.0492, 0.0421, 1.0000]])
+        """
+        super(Transformer, self).__init__()
+        if backend != 'torch':
+            raise ValueError(
+                f'Backend \'{backend}\' is not supported, please use \'torch\' instead'
+            )
+        config_kwargs = config_args or {}
+        model_kwargs = model_args or {}
+        tokenizer_kwargs = tokenizer_args or {}
+        image_processor_kwargs = {
+            'token': model_kwargs.get('token', None),
+            'trust_remote_code': model_kwargs.get('trust_remote_code', False),
+            'revision': model_kwargs.get('revision', None),
+            'local_files_only': model_kwargs.get('local_files_only', None),
+        }
+        image_processor_kwargs.update(image_processor_args or {})
+        config = AutoConfig.from_pretrained(
+            model_name_or_path, cache_dir=cache_dir, **config_kwargs
+        )
+        self.model = AutoModel.from_pretrained(
+            model_name_or_path, config=config, cache_dir=cache_dir, **model_kwargs
+        )
+        if max_seq_length is not None and 'model_max_length' not in tokenizer_kwargs:
+            tokenizer_kwargs['model_max_length'] = max_seq_length
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name_or_path or model_name_or_path,
+            cache_dir=cache_dir,
+            **tokenizer_kwargs,
+        )
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            image_processor_name_or_path or model_name_or_path,
+            cache_dir=cache_dir,
+            **image_processor_kwargs,
+        )
+        self.assume_text_inputs = assume_text_inputs
+        # No max_seq_length set. Try to infer from model
+        if max_seq_length is None:
+            if (
+                hasattr(self.model, 'config')
+                and hasattr(self.model.config, 'max_position_embeddings')
+                and hasattr(self.tokenizer, 'model_max_length')
+            ):
+                max_seq_length = min(
+                    self.model.config.max_position_embeddings,
+                    self.tokenizer.model_max_length,
+                )
+        self.max_seq_length = max_seq_length
+        if tokenizer_name_or_path is not None:
+            self.model.config.tokenizer_class = self.tokenizer.__class__.__name__
+    @staticmethod
+    def _decode_data_image(data_image_str: str) -> Image.Image:
+        header, data = data_image_str.split(',', 1)
+        image_data = base64.b64decode(data)
+        return Image.open(BytesIO(image_data))
+    def tokenize(
+        self, texts: List[Union[str, Image.Image]], padding: Union[str, bool] = True
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Encodes input samples. Text samples are tokenized. Image URLs, image data
+        buffers and PIL images are passed through the image processor.
+        """
+        _images = []
+        _texts = []
+        _image_or_text_descriptors = []
+        if self.assume_text_inputs:
+            for sample in texts:
+                if isinstance(sample, str):
+                    _texts.append(sample)
+                    _image_or_text_descriptors.append(1)
+        else:
+            for sample in texts:
+                if isinstance(sample, str):
+                    if sample.startswith('http'):
+                        try:
+                            response = requests.get(sample)
+                            _images.append(
+                                Image.open(BytesIO(response.content)).convert('RGB')
+                            )
+                            _image_or_text_descriptors.append(0)
+                        except Exception as e:
+                            _ = str(e)
+                            _texts.append(sample)
+                            _image_or_text_descriptors.append(1)
+                    elif sample.startswith('data:image/'):
+                        _images.append(self._decode_data_image(sample).convert('RGB'))
+                        _image_or_text_descriptors.append(0)
+                    else:
+                        try:
+                            _images.append(Image.open(sample).convert('RGB'))
+                            _image_or_text_descriptors.append(0)
+                        except Exception as e:
+                            _ = str(e)
+                            _texts.append(sample)
+                            _image_or_text_descriptors.append(1)
+                elif isinstance(sample, Image.Image):
+                    _images.append(sample.convert('RGB'))
+                    _image_or_text_descriptors.append(0)
+        encoding = {}
+        if len(_texts):
+            encoding['input_ids'] = self.tokenizer(
+                _texts,
+                padding=padding,
+                truncation='longest_first',
+                return_tensors='pt',
+                max_length=self.max_seq_length,
+            ).input_ids
+        if len(_images):
+            encoding['pixel_values'] = self.image_processor(
+                _images, return_tensors='pt'
+            ).pixel_values
+        encoding['image_text_info'] = _image_or_text_descriptors
+        return encoding
+    def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        image_embeddings = []
+        text_embeddings = []
+        if 'pixel_values' in features:
+            image_embeddings = self.model.get_image_features(features['pixel_values'])
+        if 'input_ids' in features:
+            text_embeddings = self.model.get_text_features(features['input_ids'])
+        sentence_embedding = []
+        image_features = iter(image_embeddings)
+        text_features = iter(text_embeddings)
+        for _, _input_type in enumerate(features['image_text_info']):
+            if _input_type == 0:
+                sentence_embedding.append(next(image_features))
+            else:
+                sentence_embedding.append(next(text_features))
+        features['sentence_embedding'] = torch.stack(sentence_embedding).float()
+        return features
+    def save(self, output_path: str, safe_serialization: bool = True) -> None:
+        self.model.save_pretrained(output_path, safe_serialization=safe_serialization)
+        self.tokenizer.save_pretrained(output_path)
+        self.image_processor.save_pretrained(output_path)
+    @staticmethod
+    def load(input_path: str) -> 'Transformer':
+        # Old classes used other config names than 'sentence_bert_config.json'
+        for config_name in [
+            'sentence_bert_config.json',
+            'sentence_roberta_config.json',
+            'sentence_distilbert_config.json',
+            'sentence_camembert_config.json',
+            'sentence_albert_config.json',
+            'sentence_xlm-roberta_config.json',
+            'sentence_xlnet_config.json',
+        ]:
+            sbert_config_path = os.path.join(input_path, config_name)
+            if os.path.exists(sbert_config_path):
+                break
+        with open(sbert_config_path) as fIn:
+            config = json.load(fIn)
+        # Don't allow configs to set trust_remote_code
+        if 'config_kwargs' in config and 'trust_remote_code' in config['config_kwargs']:
+            config['config_kwargs'].pop('trust_remote_code')
+        if 'model_kwargs' in config and 'trust_remote_code' in config['model_kwargs']:
+            config['model_kwargs'].pop('trust_remote_code')
+        if (
+            'tokenizer_kwargs' in config
+            and 'trust_remote_code' in config['tokenizer_kwargs']
+        ):
+            config['tokenizer_kwargs'].pop('trust_remote_code')
+        if (
+            'image_processor_kwargs' in config
+            and 'trust_remote_code' in config['image_processor_kwargs']
+        ):
+            config['image_processor_kwargs'].pop('trust_remote_code')
+        return Transformer(model_name_or_path=input_path, **config)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eff4c0a13ab4de71a9927a56968fef44e626920ff935e503f1bd3e6ec797062d
+size 1730688642

modules.json ADDED Viewed

	@@ -0,0 +1,14 @@

+[
+    {
+        "idx": 0,
+        "name": "transformer",
+        "path": "",
+        "type": "custom_st.Transformer"
+    },
+    {
+        "idx": 1,
+        "name": "normalizer",
+        "path": "1_Normalize",
+        "type": "sentence_transformers.models.Normalize"
+    }
+]

onnx/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8bdc8a8124e10305fa97140ba11902e06b91f8a4dcf13a1664da521cdc155ed
+size 2090152

onnx/model.onnx_data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ec73a3b6f33c472249f7b058c3cbfb9586483b88ee5930b3b3749dff7acd873
+size 3453550848

onnx/model_bnb4.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1029184ed517000825a39faa6ff154c68b98ef06a67dd2f88b3269ef69fec55f
+size 1379631302

onnx/model_fp16.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:746a78209096d1cd52891b70d752903b8bf86088ba847bd0c56c03fb29256801
+size 1728814880

onnx/model_int8.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21b8b77a009865faecaa29f076ee55d6334ea42699a9efa14d542ce8d3938a3f
+size 874350932

onnx/model_q4.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:032967b983b9cea6fb94811f4b6fe1986deff89d13ee794a1f3b124df711f5c5
+size 1417712750

onnx/model_q4f16.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a753f9cdd4061cdd4342beeafeb442b2e5b562b49bb47d3bebedad7693fa602
+size 861019483

onnx/model_quantized.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65c6423fc82eecffb7f7f813730c6a6f0d28e2dc908e414250733b1416ed30bf
+size 874351078

onnx/model_uint8.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65c6423fc82eecffb7f7f813730c6a6f0d28e2dc908e414250733b1416ed30bf
+size 874351078

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+    "auto_map": {
+        "AutoImageProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPImageProcessor",
+        "AutoProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPProcessor"
+    },
+    "fill_color": 0,
+    "image_processor_type": "JinaCLIPImageProcessor",
+    "interpolation": "bicubic",
+    "mean": [
+        0.48145466,
+        0.4578275,
+        0.40821073
+    ],
+    "processor_class": "JinaCLIPProcessor",
+    "resize_mode": "shortest",
+    "size": 512,
+    "std": [
+        0.26862954,
+        0.26130258,
+        0.27577711
+    ]
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8721cced553dc439b45c1dfd30d36d7535ab93f92e38aae7fa36f4380ffdd11d
+size 1730896230

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6601c4120779a1a3863897ba332fe3481d548e363bec2c91eba10ef8640a5e93
+size 17082997

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 8194,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}