rm rf remote code and add notice
Browse files
README.md
CHANGED
|
@@ -2619,6 +2619,8 @@ language:
|
|
| 2619 |
|
| 2620 |
For example, if you are implementing a RAG application, you embed your documents as `search_document: <text here>` and embed your user queries as `search_query: <text here>`.
|
| 2621 |
|
|
|
|
|
|
|
| 2622 |
## Task instruction prefixes
|
| 2623 |
|
| 2624 |
### `search_document`
|
|
@@ -2630,7 +2632,7 @@ This prefix is used for embedding texts as documents, for example as documents f
|
|
| 2630 |
```python
|
| 2631 |
from sentence_transformers import SentenceTransformer
|
| 2632 |
|
| 2633 |
-
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
|
| 2634 |
sentences = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten']
|
| 2635 |
embeddings = model.encode(sentences)
|
| 2636 |
print(embeddings)
|
|
@@ -2645,7 +2647,7 @@ This prefix is used for embedding texts as questions that documents from a datas
|
|
| 2645 |
```python
|
| 2646 |
from sentence_transformers import SentenceTransformer
|
| 2647 |
|
| 2648 |
-
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
|
| 2649 |
sentences = ['search_query: Who is Laurens van Der Maaten?']
|
| 2650 |
embeddings = model.encode(sentences)
|
| 2651 |
print(embeddings)
|
|
@@ -2660,7 +2662,7 @@ This prefix is used for embedding texts in order to group them into clusters, di
|
|
| 2660 |
```python
|
| 2661 |
from sentence_transformers import SentenceTransformer
|
| 2662 |
|
| 2663 |
-
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
|
| 2664 |
sentences = ['clustering: the quick brown fox']
|
| 2665 |
embeddings = model.encode(sentences)
|
| 2666 |
print(embeddings)
|
|
@@ -2675,7 +2677,7 @@ This prefix is used for embedding texts into vectors that will be used as featur
|
|
| 2675 |
```python
|
| 2676 |
from sentence_transformers import SentenceTransformer
|
| 2677 |
|
| 2678 |
-
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
|
| 2679 |
sentences = ['classification: the quick brown fox']
|
| 2680 |
embeddings = model.encode(sentences)
|
| 2681 |
print(embeddings)
|
|
@@ -2689,7 +2691,7 @@ from sentence_transformers import SentenceTransformer
|
|
| 2689 |
|
| 2690 |
matryoshka_dim = 512
|
| 2691 |
|
| 2692 |
-
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
|
| 2693 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
| 2694 |
embeddings = model.encode(sentences, convert_to_tensor=True)
|
| 2695 |
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
|
|
@@ -2713,7 +2715,7 @@ def mean_pooling(model_output, attention_mask):
|
|
| 2713 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
| 2714 |
|
| 2715 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
| 2716 |
-
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
|
| 2717 |
model.eval()
|
| 2718 |
|
| 2719 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
|
@@ -2736,9 +2738,9 @@ The model natively supports scaling of the sequence length past 2048 tokens. To
|
|
| 2736 |
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
| 2737 |
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
|
| 2738 |
|
| 2739 |
-
|
| 2740 |
-
|
| 2741 |
-
+ model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', rope_parameters=rope_parameters)
|
| 2742 |
```
|
| 2743 |
|
| 2744 |
### Transformers.js
|
|
|
|
| 2619 |
|
| 2620 |
For example, if you are implementing a RAG application, you embed your documents as `search_document: <text here>` and embed your user queries as `search_query: <text here>`.
|
| 2621 |
|
| 2622 |
+
**Notice**: As of transformers v5.5.0 and sentence-transformers v5.3.0, `trust_remote_code=True` is no longer necessary. For now, this applies only to the text-only model series.
|
| 2623 |
+
|
| 2624 |
## Task instruction prefixes
|
| 2625 |
|
| 2626 |
### `search_document`
|
|
|
|
| 2632 |
```python
|
| 2633 |
from sentence_transformers import SentenceTransformer
|
| 2634 |
|
| 2635 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5")
|
| 2636 |
sentences = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten']
|
| 2637 |
embeddings = model.encode(sentences)
|
| 2638 |
print(embeddings)
|
|
|
|
| 2647 |
```python
|
| 2648 |
from sentence_transformers import SentenceTransformer
|
| 2649 |
|
| 2650 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5")
|
| 2651 |
sentences = ['search_query: Who is Laurens van Der Maaten?']
|
| 2652 |
embeddings = model.encode(sentences)
|
| 2653 |
print(embeddings)
|
|
|
|
| 2662 |
```python
|
| 2663 |
from sentence_transformers import SentenceTransformer
|
| 2664 |
|
| 2665 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5")
|
| 2666 |
sentences = ['clustering: the quick brown fox']
|
| 2667 |
embeddings = model.encode(sentences)
|
| 2668 |
print(embeddings)
|
|
|
|
| 2677 |
```python
|
| 2678 |
from sentence_transformers import SentenceTransformer
|
| 2679 |
|
| 2680 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5")
|
| 2681 |
sentences = ['classification: the quick brown fox']
|
| 2682 |
embeddings = model.encode(sentences)
|
| 2683 |
print(embeddings)
|
|
|
|
| 2691 |
|
| 2692 |
matryoshka_dim = 512
|
| 2693 |
|
| 2694 |
+
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5")
|
| 2695 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
| 2696 |
embeddings = model.encode(sentences, convert_to_tensor=True)
|
| 2697 |
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
|
|
|
|
| 2715 |
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
|
| 2716 |
|
| 2717 |
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
| 2718 |
+
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
|
| 2719 |
model.eval()
|
| 2720 |
|
| 2721 |
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
|
|
|
| 2738 |
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
| 2739 |
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
|
| 2740 |
|
| 2741 |
+
- model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
|
| 2742 |
+
+ rope_parameters = {"rope_theta": 1000.0, "rope_type": "dynamic", "factor": 2.0}
|
| 2743 |
+
+ model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', rope_parameters=rope_parameters)
|
| 2744 |
```
|
| 2745 |
|
| 2746 |
### Transformers.js
|